aicpu migration from sjx: delete the first 9 ops
This commit is contained in:
parent 08aa1515d3
commit c137e34989
@ -96,6 +96,8 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "truncLongCastAssignment"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "knownConditionTrueFalse"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "passedByValue"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
@ -129,6 +129,7 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "legal/copyright"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "readability/inheritance"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "runtime/int"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/empty_if_body"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/newline"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/operators"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/comma"
@ -282,6 +282,14 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multinomial.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_set_diag_v3.cc:aicpu::MatrixSetDiagV3CpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparseaddmm.cc:aicpu::SparseAddmmCpuKernel::SparseAddmmCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::TripletMarginLossComputeRealType
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::TripletMarginLossComputeComplexType
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::TripletMarginLossComputeRealTypeFloat16
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_matrix_transpose.cc:aicpu::SparseMatrixTransposeCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_reshape.cc:aicpu::SparseReshapeCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/tensor_scatter_update.cc:aicpu::TensorScatterUpdateCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd.cc:aicpu::ScatterNdCpuKernel::Compute
@ -35,6 +35,7 @@
namespace mindspore {
// Op names. These ops do not exist in operator/ops.h, so their names are defined here.
constexpr auto kSparseApplyCenteredRMSPropOpName = "SparseApplyCenteredRMSProp";
constexpr auto kSparseApplyMomentumOpName = "SparseApplyMomentum";
constexpr auto kAbsOpName = "Abs";
constexpr auto kAccumulateNV2OpName = "AccumulateNV2";
constexpr auto kAdamApplyOneAssignOpName = "AdamApplyOneAssign";
@ -49,7 +50,17 @@ constexpr auto kAdaptiveAvgPool2dOpName = "AdaptiveAvgPool2d";
|
|||
constexpr auto kAdaptiveAvgPool2dGradOpName = "AdaptiveAvgPool2dGrad";
|
||||
constexpr auto kAdaptiveMaxPool3DGradOpName = "AdaptiveMaxPool3DGrad";
|
||||
constexpr auto kAddNOpName = "AddN";
|
||||
constexpr auto kAddV2OpName = "AddV2";
|
||||
constexpr auto kAddOpName = "Add";
|
||||
constexpr auto kAdaptiveAvgPool3DOpName = "AdaptiveAvgPool3D";
|
||||
constexpr auto kAdaptiveMaxPool3DOpName = "AdaptiveMaxPool3D";
|
||||
constexpr auto kAdaptiveAvgPool3DGradOpName = "AdaptiveAvgPool3DGrad";
|
||||
constexpr auto kAdaptiveMaxPool2DGradOpName = "AdaptiveMaxPool2DGrad";
|
||||
constexpr auto kAdjustContrastv2OpName = "AdjustContrastv2";
|
||||
constexpr auto kAdjustHueOpName = "AdjustHue";
|
||||
constexpr auto kAdjustSaturationOpName = "AdjustSaturation";
|
||||
constexpr auto kAngleOpName = "Angle";
|
||||
constexpr auto kAffineGridGradOpName = "AffineGridGrad";
|
||||
constexpr auto kApplyAdadeltaDOpName = "ApplyAdadeltaD";
|
||||
constexpr auto kApplyAdadeltaOpName = "ApplyAdadelta";
|
||||
constexpr auto kApplyAdagradDADOpName = "ApplyAdagradDAD";
|
||||
|
@ -92,7 +103,10 @@ constexpr auto kArgMinDOpName = "ArgMinD";
|
|||
constexpr auto kArgminOpName = "Argmin";
|
||||
constexpr auto kArgMinOpName = "ArgMin";
|
||||
constexpr auto kArgminV2OpName = "ArgminV2";
|
||||
constexpr auto kArgMinWithValueOpName = "ArgMinWithValue";
|
||||
constexpr auto kArgMaxWithValueOpName = "ArgMaxWithValue";
|
||||
constexpr auto KAsinGradOpName = "AsinGrad";
|
||||
constexpr auto KAsinhGradOpName = "AsinhGrad";
|
||||
constexpr auto kAssignAddOpName = "AssignAdd";
|
||||
constexpr auto kAssignOpName = "Assign";
|
||||
constexpr auto kAssignSubOpName = "AssignSub";
|
||||
|
@ -103,6 +117,8 @@ constexpr auto kAvgPool3DOpName = "AvgPool3D";
|
|||
constexpr auto kACosOpName = "ACos";
|
||||
constexpr auto kACosGradOpName = "ACosGrad";
|
||||
constexpr auto kAcosGradOpName = "AcosGrad";
|
||||
constexpr auto kACoshOpName = "ACosh";
|
||||
constexpr auto kAcoshGradOpName = "ACoshGrad";
|
||||
constexpr auto kAvgPool3DDOpName = "AvgPool3DD";
|
||||
constexpr auto kAvgPoolGradOpName = "AvgPoolGrad";
|
||||
constexpr auto kAvgPoolGradDOpName = "AvgPoolGradD";
|
||||
|
@ -113,10 +129,12 @@ constexpr auto kBasicLSTMCellCStateGradV2OpName = "BasicLSTMCellCStateGradV2";
|
|||
constexpr auto kBasicLSTMCellInputGradOpName = "BasicLSTMCellInputGrad";
|
||||
constexpr auto kBasicLSTMCellOpName = "BasicLSTMCell";
|
||||
constexpr auto kBasicLSTMCellWeightGradOpName = "BasicLSTMCellWeightGrad";
|
||||
constexpr auto kBartlettWindowOpName = "BartlettWindow";
|
||||
constexpr auto kBatchMatMulOpName = "BatchMatMul";
|
||||
constexpr auto kBatchMatMulV2OpName = "BatchMatMulV2";
|
||||
constexpr auto kBatchNormOpName = "BatchNorm";
|
||||
constexpr auto kBatchNormGradOpName = "BatchNormGrad";
|
||||
constexpr auto kBatchNormGradGradOpName = "BatchNormGradGrad";
|
||||
constexpr auto kBatchNormGradWithActivation = "BatchNormGradWithActivation";
|
||||
constexpr auto kBatchNormGradWithAddAndActivation = "BatchNormGradWithAddAndActivation";
|
||||
constexpr auto kBatchNormWithActivation = "BatchNormWithActivation";
|
||||
|
@ -130,7 +148,9 @@ constexpr auto kBiasAddOpName = "BiasAdd";
|
|||
constexpr auto kBiasAddGradOpName = "BiasAddGrad";
|
||||
constexpr auto kIndexAddOpName = "IndexAdd";
|
||||
constexpr auto kBitwiseOrOpName = "BitwiseOr";
|
||||
constexpr auto kBincountOpName = "Bincount";
|
||||
constexpr auto kBCEWithLogitsLossOpName = "BCEWithLogitsLoss";
|
||||
constexpr auto kBlackmanWindowOpName = "BlackmanWindow";
|
||||
constexpr auto kBN2AddReluOpName = "BN2AddRelu";
|
||||
constexpr auto kBN2OpName = "BN2";
|
||||
constexpr auto kBN2ReLUOpName = "BN2Relu";
|
||||
|
@ -214,6 +234,13 @@ constexpr auto kCSRMVOpName = "CSRMV";
|
|||
constexpr auto kCSRReduceSumOpName = "CSRReduceSum";
|
||||
constexpr auto kCSRSparseMatrixToDenseOpName = "CSRSparseMatrixToDense";
|
||||
constexpr auto kCSRSparseMatrixToSparseTensorOpName = "CSRSparseMatrixToSparseTensor";
|
||||
constexpr auto kSparseMatrixMatMulOpName = "SparseMatrixMatMul";
|
||||
constexpr auto kSparseMatrixNNZOpName = "SparseMatrixNNZ";
|
||||
constexpr auto kSparseMatrixTransposeOpName = "SparseMatrixTranspose";
|
||||
constexpr auto kSparseReshapeOpName = "SparseReshape";
|
||||
constexpr auto kSparseSegmentSqrtNGradOpName = "SparseSegmentSqrtNGrad";
|
||||
constexpr auto kSparseSegmentSumWithNumSegmentsOpName = "SparseSegmentSumWithNumSegments";
|
||||
constexpr auto kSparseSegmentSqrtNWithNumSegmentsOpName = "SparseSegmentSqrtNWithNumSegments";
|
||||
constexpr auto kCTCGreedyDecoderOpName = "CTCGreedyDecoder";
|
||||
constexpr auto kCumprodOpName = "Cumprod";
|
||||
constexpr auto kCumprodDOpName = "CumprodD";
|
||||
|
@ -610,6 +637,7 @@ constexpr auto kRpcSendOpName = "RpcSend";
|
|||
constexpr auto kRpnProposalsOpName = "RpnProposals";
|
||||
constexpr auto kRpnProposalsDOpName = "RpnProposalsD";
|
||||
constexpr auto kRsqrtGradOpName = "RsqrtGrad";
|
||||
constexpr auto kSqrtGradOpName = "SqrtGrad";
|
||||
constexpr auto kRsqrtOpName = "Rsqrt";
|
||||
constexpr auto kSampleDistortedBoundingBoxExt2OpName = "SampleDistortedBoundingBoxExt2";
|
||||
constexpr auto kScaleAndTranslateOpName = "ScaleAndTranslate";
|
||||
|
@ -659,9 +687,11 @@ constexpr auto kSpaceToBatchNDDOpName = "SpaceToBatchNDD";
|
|||
constexpr auto kSpaceToDepthOpName = "SpaceToDepth";
|
||||
constexpr auto kSparseApplyAdadeltaOpName = "SparseApplyAdadelta";
|
||||
constexpr auto kSparseFillEmptyRows = "SparseFillEmptyRows";
|
||||
constexpr auto kSparseFillEmptyRowsGradOpName = "SparseFillEmptyRowsGrad";
|
||||
constexpr auto kSparseApplyAdadeltaDOpName = "SparseApplyAdadeltaD";
|
||||
constexpr auto kSparseApplyAdagradOpName = "SparseApplyAdagrad";
|
||||
constexpr auto kSparseApplyAdagradDOpName = "SparseApplyAdagradD";
|
||||
constexpr auto kSparseApplyAdagradDAOpName = "SparseApplyAdagradDA";
|
||||
constexpr auto kSparseApplyAdagradV2OpName = "SparseApplyAdagradV2";
|
||||
constexpr auto kSparseApplyAdagradV2DOpName = "SparseApplyAdagradV2D";
|
||||
constexpr auto kSparseApplyFtrlOpName = "SparseApplyFtrl";
|
||||
|
@ -670,9 +700,15 @@ constexpr auto kSparseApplyFtrlV2OpName = "SparseApplyFtrlV2";
|
|||
constexpr auto kSparseApplyFtrlV2DOpName = "SparseApplyFtrlV2D";
|
||||
constexpr auto kSparseApplyProximalAdagradDOpName = "SparseApplyProximalAdagradD";
|
||||
constexpr auto kSparseApplyProximalAdagradOpName = "SparseApplyProximalAdagrad";
|
||||
constexpr auto kSparseApplyProximalGradientDescentOpName = "SparseApplyProximalGradientDescent";
|
||||
constexpr auto kSparseApplyRMSPropOpName = "SparseApplyRMSProp";
|
||||
constexpr auto kSparseApplyRMSPropDOpName = "SparseApplyRMSPropD";
|
||||
constexpr auto kSparseAddmmOpName = "SparseAddmm";
|
||||
constexpr auto kSparseCrossOpName = "SparseCross";
|
||||
constexpr auto kSparseDenseCwiseMulOpName = "SparseDenseCwiseMul";
|
||||
constexpr auto kSparseDenseCwiseDivOpName = "SparseDenseCwiseDiv";
|
||||
constexpr auto kSparseDenseCwiseAddOpName = "SparseDenseCwiseAdd";
|
||||
constexpr auto kSparseConcatOpName = "SparseConcat";
|
||||
constexpr auto kSparseGatherV2OpName = "SparseGatherV2";
|
||||
constexpr auto kSparseSliceOpName = "SparseSlice";
|
||||
constexpr auto kSparseSoftmaxCrossEntropyWithLogitsOpName = "SparseSoftmaxCrossEntropyWithLogits";
|
||||
|
@ -711,6 +747,7 @@ constexpr auto kSubAndFilterOpName = "SubAndFilter";
|
|||
constexpr auto kSubOpName = "Sub";
|
||||
constexpr auto kSubscalarOpName = "Subscalar";
|
||||
constexpr auto kSwitchOpName = "Switch";
|
||||
constexpr auto kTanhOpName = "Tanh";
|
||||
constexpr auto kTensorAddOpName = "Add";
|
||||
constexpr auto kTensorCopySlicesOpName = "TensorCopySlices";
|
||||
constexpr auto kTensorMoveOpName = "TensorMove";
|
||||
|
@ -725,6 +762,10 @@ constexpr auto kTransposeDOpName = "TransposeD";
|
|||
constexpr auto kTruncatedNormal = "TruncatedNormal";
|
||||
constexpr auto kTruncateDivOpName = "TruncateDiv";
|
||||
constexpr auto kTruncOpName = "Trunc";
|
||||
constexpr auto kTridiagonalMatMulOpName = "TridiagonalMatMul";
|
||||
constexpr auto kTrilIndicesOpName = "TrilIndices";
|
||||
constexpr auto kTriuIndicesOpName = "TriuIndices";
|
||||
constexpr auto kTripletMarginLossOpName = "TripletMarginLoss";
|
||||
constexpr auto kUniformCandidateSamplerOpName = "UniformCandidateSampler";
|
||||
constexpr auto kLogUniformCandidateSamplerOpName = "LogUniformCandidateSampler";
|
||||
constexpr auto kUniformIntOpName = "UniformInt";
|
||||
|
@ -743,8 +784,12 @@ constexpr auto kUnsortedSegmentProdOpName = "UnsortedSegmentProd";
|
|||
constexpr auto kUnsortedSegmentProdDOpName = "UnsortedSegmentProdD";
|
||||
constexpr auto kUnsortedSegmentSumOpName = "UnsortedSegmentSum";
|
||||
constexpr auto kUnsortedSegmentSumDOpName = "UnsortedSegmentSumD";
|
||||
constexpr auto kUnravelIndexOpName = "UnravelIndex";
|
||||
constexpr auto kUpdateCacheOpName = "UpdateCache";
|
||||
constexpr auto kUpdateStateOpName = "UpdateState";
|
||||
constexpr auto kUpperBoundOpName = "UpperBound";
|
||||
constexpr auto kXlogyOpName = "Xlogy";
|
||||
constexpr auto kXdivyOpName = "Xdivy";
|
||||
constexpr auto kDynamicBroadcastToOpName = "DynamicBroadcastTo";
|
||||
constexpr auto kCheckValidOpName = "CheckValid";
|
||||
constexpr auto kSoftmaxGradFusionOpName = "SoftmaxGradFusion";
|
||||
|
|
|
@ -0,0 +1,283 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_matrix_mat_mul.h"
|
||||
#include <securec.h>
|
||||
#include <complex>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/allocator_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace aicpu {
|
||||
const char *SparseMatrixMatMul = "SparseMatrixMatMul";
|
||||
const int INPUT_PARAMS_NUM = 6;
|
||||
const int OUTPUT_PARAMS_NUM = 1;
|
||||
} // namespace aicpu
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SparseMatrixMatMulCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
if (ValidParam(ctx) != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("valid sparse matrix mat mul param error.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
DataType indice_type = ctx.Input(0)->GetDataType();
|
||||
DataType value_type = ctx.Input(4)->GetDataType();
|
||||
uint32_t status;
|
||||
switch (indice_type) {
|
||||
case DT_INT32:
|
||||
switch (value_type) {
|
||||
case DT_FLOAT:
|
||||
status = DoCompute<int32_t, float_t>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
status = DoCompute<int32_t, double_t>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
status = DoCompute<int32_t, complex<float_t> >(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
status = DoCompute<int32_t, complex<double_t> >(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (value_type) {
|
||||
case DT_FLOAT:
|
||||
status = DoCompute<int64_t, float_t>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
status = DoCompute<int64_t, double_t>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
status = DoCompute<int64_t, complex<float_t> >(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
status = DoCompute<int64_t, complex<double_t> >(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (status != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("error in do the actual compute!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename indiceT, typename valueT>
|
||||
Eigen::Ref<const Eigen::SparseMatrix<valueT, Eigen::RowMajor, indiceT> >
|
||||
SparseMatrixMatMulCpuKernel::CreateEigenSparseMatrix(indiceT rows, indiceT cols, int64_t nnz, indiceT *row_pointers,
|
||||
indiceT *col_indices, valueT *values, bool transpose,
|
||||
bool adjoint) {
|
||||
Eigen::Map<const Eigen::SparseMatrix<valueT, Eigen::RowMajor, indiceT> > sparse_matrix(rows, cols, nnz, row_pointers,
|
||||
col_indices, values);
|
||||
// The transpose/adjoint expressions are not actually evaluated until
|
||||
// necessary. Hence we don't create copies or modify the input matrix
|
||||
// inplace.
|
||||
if (transpose) {
|
||||
return sparse_matrix.transpose();
|
||||
}
|
||||
if (adjoint) {
|
||||
return sparse_matrix.adjoint();
|
||||
}
|
||||
return sparse_matrix;
|
||||
}
|
||||
|
||||
uint32_t SparseMatrixMatMulCpuKernel::ValidParam(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_DEBUG("Start to execute ValidParam.");
|
||||
// valid input and output nullptr
|
||||
if (NormalCheck(ctx, INPUT_PARAMS_NUM, OUTPUT_PARAMS_NUM) != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
// check if the matrix can mul
|
||||
DataType dt = ctx.Input(0)->GetDataType(); // dense shape x1
|
||||
uint32_t checkStatus;
|
||||
switch (dt) {
|
||||
case DT_INT32:
|
||||
checkStatus = CheckMatMul<int32_t>(ctx);
|
||||
break;
|
||||
case DT_INT64:
|
||||
checkStatus = CheckMatMul<int64_t>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (checkStatus != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("the two input matrixs cannot mul cause their dim!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseMatrixMatMulCpuKernel::CheckMatMul(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_DEBUG("check if the matrix can mul");
|
||||
|
||||
const int rank = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
const int row_dim = (rank == 2) ? 0 : 1;
|
||||
Tensor *dense_shape_x1 = ctx.Input(0);
|
||||
T *shape_x1 = static_cast<T *>(dense_shape_x1->GetData());
|
||||
std::vector<int64_t> shape_x2 = ctx.Input(5)->GetTensorShape()->GetDimSizes();
|
||||
|
||||
bool transpose_a = false;
|
||||
bool transpose_b = false;
|
||||
bool adjoint_a = false;
|
||||
bool adjoint_b = false;
|
||||
|
||||
if (ctx.GetAttr("transpose_x1") != nullptr) {
|
||||
transpose_a = ctx.GetAttr("transpose_x1")->GetBool();
|
||||
}
|
||||
if (ctx.GetAttr("transpose_x2") != nullptr) {
|
||||
transpose_b = ctx.GetAttr("transpose_x2")->GetBool();
|
||||
}
|
||||
if (ctx.GetAttr("adjoint_x1") != nullptr) {
|
||||
adjoint_a = ctx.GetAttr("adjoint_x1")->GetBool();
|
||||
}
|
||||
if (ctx.GetAttr("adjoint_x2") != nullptr) {
|
||||
adjoint_b = ctx.GetAttr("adjoint_x2")->GetBool();
|
||||
}
|
||||
|
||||
T x1_col = (transpose_a || adjoint_a) ? shape_x1[row_dim] : shape_x1[row_dim + 1];
|
||||
T x2_row = (transpose_b || adjoint_b) ? shape_x2[row_dim + 1] : shape_x2[row_dim];
|
||||
if (x1_col != x2_row) {
|
||||
KERNEL_LOG_ERROR("x1's col is no equal x2's row, cannot do mat mul!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t SparseMatrixMatMulCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
using Matrix = Eigen::Matrix<valueT, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
|
||||
|
||||
indiceT batch_size = ctx.Input(1)->NumElements() - 1;
|
||||
std::vector<Matrix> results(batch_size);
|
||||
int shift = (ctx.Input(0)->NumElements() == 2) ? 0 : 1;
|
||||
|
||||
indiceT row_x1 = *(static_cast<indiceT *>(ctx.Input(0)->GetData()) + shift);
|
||||
indiceT col_x1 = *(static_cast<indiceT *>(ctx.Input(0)->GetData()) + shift + 1);
|
||||
indiceT *batch_pointers_x1 = static_cast<indiceT *>(ctx.Input(1)->GetData());
|
||||
indiceT *row_pointers_x1 = static_cast<indiceT *>(ctx.Input(2)->GetData());
|
||||
indiceT *col_indices_x1 = static_cast<indiceT *>(ctx.Input(3)->GetData());
|
||||
valueT *value_x1 = static_cast<valueT *>(ctx.Input(4)->GetData());
|
||||
|
||||
std::vector<int64_t> shape_x2 = ctx.Input(5)->GetTensorShape()->GetDimSizes();
|
||||
const int rank = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
const int row_dim = (rank == 2) ? 0 : 1;
|
||||
indiceT row_x2 = shape_x2[row_dim];
|
||||
indiceT col_x2 = shape_x2[row_dim + 1];
|
||||
valueT *value_x2 = static_cast<valueT *>(ctx.Input(5)->GetData());
|
||||
|
||||
bool transpose_a = false;
|
||||
bool transpose_b = false;
|
||||
bool adjoint_a = false;
|
||||
bool adjoint_b = false;
|
||||
bool transpose_output = false;
|
||||
bool conjugate_output = false;
|
||||
if (ctx.GetAttr("transpose_x1") != nullptr) {
|
||||
transpose_a = ctx.GetAttr("transpose_x1")->GetBool();
|
||||
}
|
||||
if (ctx.GetAttr("transpose_x2") != nullptr) {
|
||||
transpose_b = ctx.GetAttr("transpose_x2")->GetBool();
|
||||
}
|
||||
if (ctx.GetAttr("adjoint_x1") != nullptr) {
|
||||
adjoint_a = ctx.GetAttr("adjoint_x1")->GetBool();
|
||||
}
|
||||
if (ctx.GetAttr("adjoint_x2") != nullptr) {
|
||||
adjoint_b = ctx.GetAttr("adjoint_x2")->GetBool();
|
||||
}
|
||||
if (ctx.GetAttr("transpose_output") != nullptr) {
|
||||
transpose_output = ctx.GetAttr("transpose_output")->GetBool();
|
||||
}
|
||||
if (ctx.GetAttr("conjugate_output") != nullptr) {
|
||||
conjugate_output = ctx.GetAttr("conjugate_output")->GetBool();
|
||||
}
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
max_core_num = std::min(max_core_num, (uint32_t)batch_size);
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max core num cannot be zero");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, batch_size, batch_size / max_core_num,
|
||||
[&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; i++) {
|
||||
int64_t nnz_x1 = batch_pointers_x1[i + 1] - batch_pointers_x1[i];
|
||||
indiceT *row_pointers_x1_batch_i = row_pointers_x1 + (row_x1 + 1) * i;
|
||||
indiceT *col_indices_x1_batch_i = col_indices_x1 + batch_pointers_x1[i];
|
||||
valueT *value_x1_batch_i = value_x1 + batch_pointers_x1[i];
|
||||
auto x1_sparse_matrix = CreateEigenSparseMatrix<indiceT, valueT>(
|
||||
row_x1, col_x1, nnz_x1, row_pointers_x1_batch_i, col_indices_x1_batch_i,
|
||||
value_x1_batch_i, transpose_a, adjoint_a);
|
||||
|
||||
Eigen::Map<Matrix> x2_dense_matrix(value_x2 + col_x2 * row_x2 * i, row_x2, col_x2);
|
||||
Matrix temp;
|
||||
if (transpose_b) {
|
||||
temp = x1_sparse_matrix * x2_dense_matrix.transpose();
|
||||
} else if (adjoint_b) {
|
||||
temp = x1_sparse_matrix * x2_dense_matrix.adjoint();
|
||||
} else {
|
||||
temp = x1_sparse_matrix * x2_dense_matrix;
|
||||
}
|
||||
|
||||
if (transpose_output) {
|
||||
results[i] = temp.transpose();
|
||||
} else if (conjugate_output) {
|
||||
results[i] = temp.conjugate();
|
||||
} else
|
||||
results[i] = temp;
|
||||
}
|
||||
}),
|
||||
"SparseMatrixMatMul Compute failed.");
|
||||
|
||||
// copy each batch's dense result into the output values tensor
|
||||
indiceT row_output, col_output;
|
||||
row_output = results[0].rows();
|
||||
col_output = results[0].cols();
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
valueT *output_values_data = static_cast<valueT *>(ctx.Output(0)->GetData());
|
||||
std::copy(results[i].data(), results[i].data() + row_output * col_output,
|
||||
output_values_data + i * row_output * col_output);
|
||||
}
|
||||
|
||||
KERNEL_LOG_DEBUG("DoCompute end!!");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// register the operator
|
||||
REGISTER_CPU_KERNEL(SparseMatrixMatMul, SparseMatrixMatMulCpuKernel);
|
||||
} // namespace aicpu
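For readers unfamiliar with the Eigen mapping performed by CreateEigenSparseMatrix above, the following standalone sketch (illustration only, not part of the commit; the matrix values are hypothetical) shows how a CSR triple of arrays is viewed as an Eigen sparse matrix without copying and multiplied by a dense right-hand side, which is the core of each batch iteration in DoCompute:

#include <Eigen/Dense>
#include <Eigen/SparseCore>

int main() {
  // 2x3 CSR matrix [[1, 0, 2], [0, 3, 0]] described by row pointers, column indices and values.
  int row_pointers[] = {0, 2, 3};
  int col_indices[] = {0, 2, 1};
  float values[] = {1.0f, 2.0f, 3.0f};
  // View the raw CSR arrays as an Eigen sparse matrix, as CreateEigenSparseMatrix does.
  Eigen::Map<const Eigen::SparseMatrix<float, Eigen::RowMajor, int>> x1(2, 3, 3, row_pointers, col_indices, values);
  // 3x2 dense operand, playing the role of x2 in DoCompute.
  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x2(3, 2);
  x2 << 1, 2, 3, 4, 5, 6;
  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> y = x1 * x2;  // y == [[11, 14], [9, 12]]
  return 0;
}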
|
|
@ -0,0 +1,46 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEMATRIXSPARSEMATMUL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSEMATRIXSPARSEMATMUL_H_
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "Eigen/SparseCore"
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class SparseMatrixMatMulCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~SparseMatrixMatMulCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t ValidParam(CpuKernelContext &ctx);
|
||||
// check if the matrix can mul
|
||||
template <typename T>
|
||||
uint32_t CheckMatMul(CpuKernelContext &ctx);
|
||||
// create eigen sparsematrix with eigen::map
|
||||
template <typename indiceT, typename valueT>
|
||||
Eigen::Ref<const Eigen::SparseMatrix<valueT, Eigen::RowMajor, indiceT> > CreateEigenSparseMatrix(
|
||||
indiceT rows, indiceT cols, int64_t nnz, indiceT *row_pointers, indiceT *col_indices, valueT *values,
|
||||
bool transpose, bool adjoint);
|
||||
// do the actual compute
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,86 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_matrix_nnz.h"
|
||||
#include <securec.h>
|
||||
#include <complex>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/allocator_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace aicpu {
|
||||
const char *SparseMatrixNNZ = "SparseMatrixNNZ";
|
||||
const int INPUT_PARAMS_NUM = 5;
|
||||
const int OUTPUT_PARAMS_NUM = 1;
|
||||
} // namespace aicpu
|
||||
namespace aicpu {
|
||||
|
||||
uint32_t SparseMatrixNNZCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
if (NormalCheck(ctx, INPUT_PARAMS_NUM, OUTPUT_PARAMS_NUM) != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
DataType indice_type = ctx.Input(1)->GetDataType();
|
||||
uint32_t status;
|
||||
switch (indice_type) {
|
||||
case DT_INT32:
|
||||
status = DoCompute<int32_t>(ctx);
|
||||
break;
|
||||
case DT_INT64:
|
||||
status = DoCompute<int64_t>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("data type of batch pointers is not int32 or int64");
|
||||
status = KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (status != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("error in do the actual compute!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename indiceT>
|
||||
uint32_t SparseMatrixNNZCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
const indiceT batch_size = ctx.Input(1)->NumElements() - 1;
|
||||
// define some temp arrays to store the output tensor data
|
||||
int32_t result_nnz[batch_size];
|
||||
// compute nnz for each batch
|
||||
indiceT *batch_pointers_x = static_cast<indiceT *>(ctx.Input(1)->GetData());
|
||||
indiceT curr = 0;
|
||||
for (int i = 1; i < batch_size + 1; i++) {
|
||||
result_nnz[i - 1] = batch_pointers_x[i] - curr;
|
||||
// update curr
|
||||
curr = batch_pointers_x[i];
|
||||
}
|
||||
// write result
|
||||
int32_t *output_y = static_cast<int32_t *>(ctx.Output(0)->GetData());
|
||||
std::copy(result_nnz, result_nnz + (int32_t)batch_size, output_y);
|
||||
|
||||
KERNEL_LOG_DEBUG("DoCompute end!!");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// register the operator
|
||||
REGISTER_CPU_KERNEL(SparseMatrixNNZ, SparseMatrixNNZCpuKernel);
|
||||
} // namespace aicpu
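As a quick illustration of what DoCompute produces (hypothetical numbers, not part of the commit): the batch pointers are a running prefix sum of non-zero counts, so the per-batch NNZ is simply the difference between adjacent entries.

#include <cstdint>
#include <vector>

// Same adjacent-difference logic as SparseMatrixNNZCpuKernel::DoCompute, on plain vectors.
std::vector<int32_t> BatchNnz(const std::vector<int32_t> &batch_pointers) {
  std::vector<int32_t> nnz;
  for (size_t i = 1; i < batch_pointers.size(); ++i) {
    nnz.push_back(batch_pointers[i] - batch_pointers[i - 1]);
  }
  return nnz;
}
// Example: BatchNnz({0, 3, 7, 7}) returns {3, 4, 0}.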
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEMATRIXNNZ_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSEMATRIXNNZ_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class SparseMatrixNNZCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~SparseMatrixNNZCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
// do the actual compute
|
||||
template <typename indiceT>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,337 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_matrix_transpose.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include <numeric>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace aicpu {
|
||||
const uint32_t kInputNum = 5;
|
||||
const uint32_t kOutputNum = 5;
|
||||
const uint32_t kzero = 0;
|
||||
const uint32_t kone = 1;
|
||||
const uint32_t ktwo = 2;
|
||||
const uint32_t kthree = 3;
|
||||
const uint32_t kfour = 4;
|
||||
const uint32_t krankwithbatch = 3;
|
||||
const char *SPARSEMATRIXTRANSPOSE = "SparseMatrixTranspose";
|
||||
} // namespace aicpu
|
||||
namespace aicpu {
|
||||
uint32_t SparseMatrixTransposeCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseMatrixTranspose normal check failed.");
|
||||
DataType indice_type = ctx.Input(0)->GetDataType();
|
||||
DataType value_type = ctx.Input(4)->GetDataType();
|
||||
uint32_t status;
|
||||
switch (indice_type) {
|
||||
case DT_INT32:
|
||||
switch (value_type) {
|
||||
case DT_UINT8:
|
||||
status = SparseMatrixTransposeCompute<int32_t, uint8_t>(ctx);
|
||||
break;
|
||||
case DT_UINT16:
|
||||
status = SparseMatrixTransposeCompute<int32_t, uint16_t>(ctx);
|
||||
break;
|
||||
case DT_UINT32:
|
||||
status = SparseMatrixTransposeCompute<int32_t, uint32_t>(ctx);
|
||||
break;
|
||||
case DT_UINT64:
|
||||
status = SparseMatrixTransposeCompute<int32_t, uint64_t>(ctx);
|
||||
break;
|
||||
case DT_INT8:
|
||||
status = SparseMatrixTransposeCompute<int32_t, int8_t>(ctx);
|
||||
break;
|
||||
case DT_INT16:
|
||||
status = SparseMatrixTransposeCompute<int32_t, int16_t>(ctx);
|
||||
break;
|
||||
case DT_INT32:
|
||||
status = SparseMatrixTransposeCompute<int32_t, int32_t>(ctx);
|
||||
break;
|
||||
case DT_INT64:
|
||||
status = SparseMatrixTransposeCompute<int32_t, int64_t>(ctx);
|
||||
break;
|
||||
case DT_FLOAT16:
|
||||
status = SparseMatrixTransposeCompute<int32_t, Eigen::half>(ctx);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
status = SparseMatrixTransposeCompute<int32_t, float_t>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
status = SparseMatrixTransposeCompute<int32_t, double_t>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
status = SparseMatrixTransposeComputecomplex<int32_t, complex<float_t>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
status = SparseMatrixTransposeComputecomplex<int32_t, complex<double_t>>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("data type of x_value is not required");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (value_type) {
|
||||
case DT_UINT8:
|
||||
status = SparseMatrixTransposeCompute<int64_t, uint8_t>(ctx);
|
||||
break;
|
||||
case DT_UINT16:
|
||||
status = SparseMatrixTransposeCompute<int64_t, uint16_t>(ctx);
|
||||
break;
|
||||
case DT_UINT32:
|
||||
status = SparseMatrixTransposeCompute<int64_t, uint32_t>(ctx);
|
||||
break;
|
||||
case DT_UINT64:
|
||||
status = SparseMatrixTransposeCompute<int64_t, uint64_t>(ctx);
|
||||
break;
|
||||
case DT_INT8:
|
||||
status = SparseMatrixTransposeCompute<int64_t, int8_t>(ctx);
|
||||
break;
|
||||
case DT_INT16:
|
||||
status = SparseMatrixTransposeCompute<int64_t, int16_t>(ctx);
|
||||
break;
|
||||
case DT_INT32:
|
||||
status = SparseMatrixTransposeCompute<int64_t, int32_t>(ctx);
|
||||
break;
|
||||
case DT_INT64:
|
||||
status = SparseMatrixTransposeCompute<int64_t, int64_t>(ctx);
|
||||
break;
|
||||
case DT_FLOAT16:
|
||||
status = SparseMatrixTransposeCompute<int64_t, Eigen::half>(ctx);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
status = SparseMatrixTransposeCompute<int64_t, float_t>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
status = SparseMatrixTransposeCompute<int64_t, double_t>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
status = SparseMatrixTransposeComputecomplex<int64_t, complex<float_t>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
status = SparseMatrixTransposeComputecomplex<int64_t, complex<double_t>>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("data type of x_value is not required");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (status != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("error in do the actual compute!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t SparseMatrixTransposeCpuKernel::SparseMatrixTransposeCompute(CpuKernelContext &ctx) {
|
||||
indiceT *x_dense_shape = static_cast<indiceT *>(ctx.Input(0)->GetData());
|
||||
indiceT *x_batch_pointers = static_cast<indiceT *>(ctx.Input(1)->GetData());
|
||||
indiceT *x_row_pointers = static_cast<indiceT *>(ctx.Input(2)->GetData());
|
||||
indiceT *x_col_indices = static_cast<indiceT *>(ctx.Input(3)->GetData());
|
||||
valueT *x_values = static_cast<valueT *>(ctx.Input(4)->GetData());
|
||||
bool conjugate = (ctx.GetAttr("conjugate") == nullptr) ? false : ctx.GetAttr("conjugate")->GetBool();
|
||||
indiceT *y_dense_shape = static_cast<indiceT *>(ctx.Output(0)->GetData());
|
||||
indiceT *y_batch_pointers = static_cast<indiceT *>(ctx.Output(1)->GetData());
|
||||
indiceT *y_row_pointers = static_cast<indiceT *>(ctx.Output(2)->GetData());
|
||||
indiceT *y_col_indices = static_cast<indiceT *>(ctx.Output(3)->GetData());
|
||||
valueT *y_values = static_cast<valueT *>(ctx.Output(4)->GetData());
|
||||
auto rank = ctx.Input(0)->NumElements();
|
||||
if (rank == krankwithbatch) {
|
||||
y_dense_shape[0] = x_dense_shape[0];
|
||||
y_dense_shape[1] = x_dense_shape[ktwo];
|
||||
y_dense_shape[ktwo] = x_dense_shape[1];
|
||||
} else {
|
||||
y_dense_shape[0] = x_dense_shape[1];
|
||||
y_dense_shape[1] = x_dense_shape[0];
|
||||
}
|
||||
auto batch_pointers = ctx.Input(1)->NumElements();
|
||||
for (int i = 0; i < batch_pointers; ++i) {
|
||||
y_batch_pointers[i] = x_batch_pointers[i];
|
||||
}
|
||||
|
||||
auto num_rows = x_dense_shape[rank - 2];
|
||||
auto num_cols = x_dense_shape[rank - 1];
|
||||
auto num_batch = ctx.Input(1)->NumElements() - 1;
|
||||
int y_part_row_pointers[num_cols + 1];
|
||||
int part_row_pointers[num_rows + 1];
|
||||
|
||||
for (int j = 0; j < num_batch; ++j) {
|
||||
int n = x_batch_pointers[j + 1] - x_batch_pointers[j];
|
||||
valueT part_values[n];
|
||||
indiceT part_col_indices[n];
|
||||
indiceT y_part_col_indices[n];
|
||||
valueT y_part_values[n];
|
||||
for (int i = 0; i < num_cols + 1; ++i) {
|
||||
y_part_row_pointers[i] = 0;
|
||||
}
|
||||
for (int k = 0; k < num_rows + 1; ++k) {
|
||||
part_row_pointers[k] = x_row_pointers[(num_rows + 1) * j + k];
|
||||
}
|
||||
for (int k = 0; k < n; ++k) {
|
||||
part_values[k] = x_values[x_batch_pointers[j] + k];
|
||||
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
|
||||
}
|
||||
for (int64_t i = 0; i < n; ++i) {
|
||||
y_part_row_pointers[part_col_indices[i] + 1] += 1;
|
||||
}
|
||||
std::partial_sum(y_part_row_pointers, y_part_row_pointers + num_cols + 1, y_part_row_pointers);
|
||||
for (int k = 0; k < num_cols + 1; ++k) {
|
||||
y_row_pointers[(num_cols + 1) * j + k] = y_part_row_pointers[k];
|
||||
}
|
||||
|
||||
for (int k = 0; k < n; ++k) {
|
||||
part_values[k] = x_values[x_batch_pointers[j] + k];
|
||||
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
|
||||
}
|
||||
std::vector<int> current_col_count(num_cols);
|
||||
for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
|
||||
const int64_t row_begin = part_row_pointers[row_idx];
|
||||
const int64_t row_end = part_row_pointers[row_idx + 1];
|
||||
for (int64_t i = row_begin; i < row_end; ++i) {
|
||||
const int col_idx = part_col_indices[i];
|
||||
const int64_t offset = y_part_row_pointers[col_idx] + current_col_count[col_idx];
|
||||
y_part_col_indices[offset] = row_idx;
|
||||
y_part_values[offset] = part_values[i];
|
||||
current_col_count[col_idx] += 1;
|
||||
}
|
||||
}
|
||||
for (int k = 0; k < n; ++k) {
|
||||
y_values[x_batch_pointers[j] + k] = y_part_values[k];
|
||||
y_col_indices[x_batch_pointers[j] + k] = y_part_col_indices[k];
|
||||
}
|
||||
}
|
||||
|
||||
// conjugate has no effect for real-valued types, so there is nothing to do here.
(void)conjugate;
|
||||
auto output = ctx.Output(2);
|
||||
auto output_shape = output->GetTensorShape();
|
||||
if (rank == ktwo) {
|
||||
output_shape->SetDimSizes({num_cols + 1});
|
||||
} else {
|
||||
output_shape->SetDimSizes({x_dense_shape[0] * (num_cols + 1)});
|
||||
}
|
||||
output->SetTensorShape(output_shape.get());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t SparseMatrixTransposeCpuKernel::SparseMatrixTransposeComputecomplex(CpuKernelContext &ctx) {
|
||||
indiceT *x_dense_shape = static_cast<indiceT *>(ctx.Input(0)->GetData());
|
||||
indiceT *x_batch_pointers = static_cast<indiceT *>(ctx.Input(1)->GetData());
|
||||
indiceT *x_row_pointers = static_cast<indiceT *>(ctx.Input(2)->GetData());
|
||||
indiceT *x_col_indices = static_cast<indiceT *>(ctx.Input(3)->GetData());
|
||||
valueT *x_values = static_cast<valueT *>(ctx.Input(4)->GetData());
|
||||
bool conjugate = (ctx.GetAttr("conjugate") == nullptr) ? false : ctx.GetAttr("conjugate")->GetBool();
|
||||
indiceT *y_dense_shape = static_cast<indiceT *>(ctx.Output(0)->GetData());
|
||||
indiceT *y_batch_pointers = static_cast<indiceT *>(ctx.Output(1)->GetData());
|
||||
indiceT *y_row_pointers = static_cast<indiceT *>(ctx.Output(2)->GetData());
|
||||
indiceT *y_col_indices = static_cast<indiceT *>(ctx.Output(3)->GetData());
|
||||
valueT *y_values = static_cast<valueT *>(ctx.Output(4)->GetData());
|
||||
auto rank = ctx.Input(0)->NumElements();
|
||||
if (rank == krankwithbatch) {
|
||||
y_dense_shape[0] = x_dense_shape[0];
|
||||
y_dense_shape[1] = x_dense_shape[ktwo];
|
||||
y_dense_shape[ktwo] = x_dense_shape[1];
|
||||
} else {
|
||||
y_dense_shape[0] = x_dense_shape[1];
|
||||
y_dense_shape[1] = x_dense_shape[0];
|
||||
}
|
||||
auto batch_pointers = ctx.Input(1)->NumElements();
|
||||
for (int i = 0; i < batch_pointers; ++i) {
|
||||
y_batch_pointers[i] = x_batch_pointers[i];
|
||||
}
|
||||
|
||||
auto num_rows = x_dense_shape[rank - 2];
|
||||
auto num_cols = x_dense_shape[rank - 1];
|
||||
auto num_batch = ctx.Input(1)->NumElements() - 1;
|
||||
int y_part_row_pointers[num_cols + 1];
|
||||
int part_row_pointers[num_rows + 1];
|
||||
|
||||
for (int j = 0; j < num_batch; ++j) {
|
||||
int n = x_batch_pointers[j + 1] - x_batch_pointers[j];
|
||||
valueT part_values[n];
|
||||
indiceT part_col_indices[n];
|
||||
indiceT y_part_col_indices[n];
|
||||
valueT y_part_values[n];
|
||||
for (int i = 0; i < num_cols + 1; ++i) {
|
||||
y_part_row_pointers[i] = 0;
|
||||
}
|
||||
for (int k = 0; k < num_rows + 1; ++k) {
|
||||
part_row_pointers[k] = x_row_pointers[(num_rows + 1) * j + k];
|
||||
}
|
||||
for (int k = 0; k < n; ++k) {
|
||||
part_values[k] = x_values[x_batch_pointers[j] + k];
|
||||
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
|
||||
}
|
||||
for (int64_t i = 0; i < n; ++i) {
|
||||
y_part_row_pointers[part_col_indices[i] + 1] += 1;
|
||||
}
|
||||
std::partial_sum(y_part_row_pointers, y_part_row_pointers + num_cols + 1, y_part_row_pointers);
|
||||
for (int k = 0; k < num_cols + 1; ++k) {
|
||||
y_row_pointers[(num_cols + 1) * j + k] = y_part_row_pointers[k];
|
||||
}
|
||||
|
||||
for (int k = 0; k < n; ++k) {
|
||||
part_values[k] = x_values[x_batch_pointers[j] + k];
|
||||
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
|
||||
}
|
||||
std::vector<int> current_col_count(num_cols);
|
||||
for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
|
||||
const int64_t row_begin = part_row_pointers[row_idx];
|
||||
const int64_t row_end = part_row_pointers[row_idx + 1];
|
||||
for (int64_t i = row_begin; i < row_end; ++i) {
|
||||
const int col_idx = part_col_indices[i];
|
||||
const int64_t offset = y_part_row_pointers[col_idx] + current_col_count[col_idx];
|
||||
y_part_col_indices[offset] = row_idx;
|
||||
y_part_values[offset] = part_values[i];
|
||||
current_col_count[col_idx] += 1;
|
||||
}
|
||||
}
|
||||
for (int k = 0; k < n; ++k) {
|
||||
y_values[x_batch_pointers[j] + k] = y_part_values[k];
|
||||
y_col_indices[x_batch_pointers[j] + k] = y_part_col_indices[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (conjugate == true) {
|
||||
for (int i = 0; i < ctx.Input(kfour)->GetTensorShape()->NumElements(); ++i) {
|
||||
y_values[i] = std::conj(y_values[i]);
|
||||
}
|
||||
}
|
||||
auto output = ctx.Output(2);
|
||||
auto output_shape = output->GetTensorShape();
|
||||
if (rank == ktwo) {
|
||||
output_shape->SetDimSizes({num_cols + 1});
|
||||
} else {
|
||||
output_shape->SetDimSizes({x_dense_shape[0] * (num_cols + 1)});
|
||||
}
|
||||
output->SetTensorShape(output_shape.get());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(SPARSEMATRIXTRANSPOSE, SparseMatrixTransposeCpuKernel);
|
||||
} // namespace aicpu
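The per-batch body above is a standard CSR transpose: count the entries landing in each transposed row, turn the counts into row pointers with a prefix sum, then scatter every (row, col, value) into its slot. A standalone single-matrix sketch of the same three steps (illustration only, with a hypothetical helper name):

#include <numeric>
#include <vector>

void CsrTranspose(int num_rows, int num_cols, const std::vector<int> &row_ptr, const std::vector<int> &col_idx,
                  const std::vector<float> &values, std::vector<int> *t_row_ptr, std::vector<int> *t_col_idx,
                  std::vector<float> *t_values) {
  const int nnz = static_cast<int>(values.size());
  t_row_ptr->assign(num_cols + 1, 0);
  t_col_idx->assign(nnz, 0);
  t_values->assign(nnz, 0.0f);
  // 1) count non-zeros per transposed row (i.e. per input column)
  for (int i = 0; i < nnz; ++i) (*t_row_ptr)[col_idx[i] + 1] += 1;
  // 2) prefix-sum the counts into transposed row pointers
  std::partial_sum(t_row_ptr->begin(), t_row_ptr->end(), t_row_ptr->begin());
  // 3) scatter each (row, col, value) into its transposed position
  std::vector<int> filled(num_cols, 0);
  for (int row = 0; row < num_rows; ++row) {
    for (int i = row_ptr[row]; i < row_ptr[row + 1]; ++i) {
      const int col = col_idx[i];
      const int offset = (*t_row_ptr)[col] + filled[col]++;
      (*t_col_idx)[offset] = row;
      (*t_values)[offset] = values[i];
    }
  }
}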
|
|
@ -0,0 +1,37 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEMATRIXTRANSPOSE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSEMATRIXTRANSPOSE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
#include "Eigen/SparseCore"
|
||||
|
||||
namespace aicpu {
|
||||
class SparseMatrixTransposeCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~SparseMatrixTransposeCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t SparseMatrixTransposeParamCheck(CpuKernelContext &ctx);
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t SparseMatrixTransposeCompute(CpuKernelContext &ctx);
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t SparseMatrixTransposeComputecomplex(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,180 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_reshape.h"
|
||||
#include <vector>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "securec.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kSparseReshapeInputNum = 3;
|
||||
constexpr uint32_t kSparseReshapeOutputNum = 2;
|
||||
const char *kSparseReshape = "SparseReshape";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNumSameShape = 24 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
void SparseReshapeCpuKernel::SpecialCompute(int64_t start, int64_t end, const int64_t *in0, int64_t *out0,
|
||||
const int64_t *input_strides, const int64_t *output_strides,
|
||||
const int64_t input_rank, const int64_t output_rank) {
|
||||
for (int i = start; i < end; i++) {
|
||||
int64_t id = 0;
|
||||
for (int j = 0; j < input_rank; j++) {
|
||||
id += *(in0 + i * input_rank + j) * input_strides[j];
|
||||
}
|
||||
for (int j = 0; j < output_rank; j++) {
|
||||
*(out0 + i * output_rank + j) = id / output_strides[j];
|
||||
id %= output_strides[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t SparseReshapeCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSparseReshapeInputNum, kSparseReshapeOutputNum), "[%s] check params failed.",
|
||||
kSparseReshape);
|
||||
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *input_2 = ctx.Input(2);
|
||||
Tensor *output_0 = ctx.Output(0);
|
||||
Tensor *output_1 = ctx.Output(1);
|
||||
|
||||
KERNEL_CHECK_FALSE(
|
||||
(input_0->GetDataType() == DT_INT64 && input_1->GetDataType() == DT_INT64 && input_2->GetDataType() == DT_INT64 &&
|
||||
output_0->GetDataType() == DT_INT64 && output_1->GetDataType() == DT_INT64),
|
||||
KERNEL_STATUS_INNER_ERROR, "the data of SparseReshape kernel must be DT_INT64.");
|
||||
KERNEL_CHECK_FALSE((input_0->GetTensorShape()->GetDimSize(1) == input_1->GetTensorShape()->GetDimSize(0)),
|
||||
KERNEL_STATUS_INNER_ERROR, "Input tensor rank must match input shape length.");
|
||||
|
||||
int64_t *in0 = reinterpret_cast<int64_t *>(input_0->GetData());
|
||||
int64_t *in1 = reinterpret_cast<int64_t *>(input_1->GetData());
|
||||
int64_t *in2 = reinterpret_cast<int64_t *>(input_2->GetData());
|
||||
int64_t *out0 = reinterpret_cast<int64_t *>(output_0->GetData());
|
||||
int64_t *out1 = reinterpret_cast<int64_t *>(output_1->GetData());
|
||||
|
||||
const int64_t input_rank = input_1->NumElements();
|
||||
const int64_t output_rank = input_2->NumElements();
|
||||
const int64_t nnz = input_0->GetTensorShape()->GetDimSize(0);
|
||||
int64_t dense_size = 1;
|
||||
int64_t product = 1;
|
||||
int64_t out_num = 1;
|
||||
int unknown_index = -1;
|
||||
|
||||
for (int i = 0; i < input_rank; i++) {
|
||||
dense_size *= *(in1 + i);
|
||||
}
|
||||
for (int d = 0; d < output_rank; d++) {
|
||||
const int64_t size = *(in2 + d);
|
||||
if (size == -1) {
|
||||
KERNEL_CHECK_FALSE((unknown_index == -1), KERNEL_STATUS_INNER_ERROR,
|
||||
"only one output dimension may be -1, "
|
||||
"not both [%d] and [%d]",
|
||||
unknown_index, d);
|
||||
unknown_index = d;
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE((size >= 0), KERNEL_STATUS_INNER_ERROR, "size of dimension [%d] must be non-negative, not [%ld]", d, size);
|
||||
product *= size;
|
||||
*(out1 + d) = size;
|
||||
out_num *= size;
|
||||
}
|
||||
}
|
||||
|
||||
if (unknown_index != -1) {
|
||||
KERNEL_CHECK_FALSE((product > 0), KERNEL_STATUS_INNER_ERROR,
|
||||
"reshape cannot infer the missing "
|
||||
"input size for an empty tensor unless all "
|
||||
"specified input sizes are non-zero");
|
||||
const int64_t missing = dense_size / product;
|
||||
KERNEL_CHECK_FALSE((product * missing == dense_size), KERNEL_STATUS_INNER_ERROR,
|
||||
"Input to reshape is a SparseTensor with [%ld]"
|
||||
" dense values, but the requested shape requires"
|
||||
" a multiple of [%ld].",
|
||||
dense_size, product);
|
||||
out_num *= missing;
|
||||
*(out1 + unknown_index) = missing;
|
||||
}
|
||||
|
||||
KERNEL_CHECK_FALSE((out_num == dense_size), KERNEL_STATUS_INNER_ERROR,
|
||||
"Input to reshape is a tensor with [%ld]"
|
||||
" dense values, but the requested shape has [%ld].",
|
||||
dense_size, out_num);
|
||||
|
||||
int64_t input_size = input_0->GetDataSize();
|
||||
int64_t output_size = output_0->GetDataSize();
|
||||
if (input_size == output_size && input_rank == output_rank) {
|
||||
bool flag = true;
|
||||
for (int64_t i = 0; i < input_rank; ++i) {
|
||||
if (*(in1 + i) != *(out1 + i)) {
|
||||
flag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (flag) {
|
||||
auto mem_ret = memcpy_s(out0, output_size, in0, input_size);
|
||||
KERNEL_CHECK_FALSE(mem_ret == EOK, KERNEL_STATUS_INNER_ERROR,
|
||||
"[%s] memcpy_s to output failed, destMax [%ld], count [%ld].", kSparseReshape, output_size,
|
||||
input_size);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
}
|
||||
|
||||
if (nnz <= 0) return KERNEL_STATUS_OK;
|
||||
int64_t *input_strides = new int64_t[input_rank];
|
||||
int64_t *output_strides = new int64_t[output_rank];
|
||||
|
||||
if (input_rank > 0) {
|
||||
input_strides[input_rank - 1] = 1;
|
||||
for (int d = input_rank - 2; d >= 0; d--) {
|
||||
input_strides[d] = input_strides[d + 1] * *(in1 + d + 1);
|
||||
}
|
||||
}
|
||||
if (output_rank > 0) {
|
||||
output_strides[output_rank - 1] = 1;
|
||||
for (int d = output_rank - 2; d >= 0; d--) {
|
||||
output_strides[d] = output_strides[d + 1] * *(out1 + d + 1);
|
||||
}
|
||||
}
|
||||
if (nnz * input_rank >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
KERNEL_CHECK_FALSE(max_core_num != 0, KERNEL_STATUS_INNER_ERROR, "core num should not be 0.");
|
||||
if (nnz * input_rank <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num > nnz) {
|
||||
max_core_num = nnz;
|
||||
}
|
||||
auto sharder_sparse_reshape = [&](int64_t start, int64_t end) {
|
||||
SpecialCompute(start, end, in0, out0, input_strides, output_strides, input_rank, output_rank);
|
||||
};
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, nnz, nnz / max_core_num, sharder_sparse_reshape),
|
||||
"SparseReshape Compute failed.");
|
||||
} else {
|
||||
SpecialCompute(0, nnz, in0, out0, input_strides, output_strides, input_rank, output_rank);
|
||||
}
|
||||
|
||||
delete[] input_strides;
|
||||
delete[] output_strides;
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSparseReshape, SparseReshapeCpuKernel);
|
||||
} // namespace aicpu
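SpecialCompute above linearizes each N-D sparse index with the input strides and re-expands the flat index with the output strides. A standalone sketch of that remapping for one index (illustration only; shapes and the helper name are hypothetical):

#include <cstdint>
#include <vector>

std::vector<int64_t> RemapIndex(const std::vector<int64_t> &index, const std::vector<int64_t> &in_strides,
                                const std::vector<int64_t> &out_strides) {
  int64_t flat = 0;
  for (size_t j = 0; j < index.size(); ++j) {
    flat += index[j] * in_strides[j];  // linearize with input strides
  }
  std::vector<int64_t> out(out_strides.size());
  for (size_t j = 0; j < out_strides.size(); ++j) {
    out[j] = flat / out_strides[j];  // re-expand with output strides
    flat %= out_strides[j];
  }
  return out;
}
// Reshaping from {2, 3, 4} (strides {12, 4, 1}) to {6, 4} (strides {4, 1}):
// RemapIndex({1, 2, 3}, {12, 4, 1}, {4, 1}) returns {5, 3}, i.e. flat index 23.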
|
|
@ -0,0 +1,33 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_RESHAPE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSE_RESHAPE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SparseReshapeCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~SparseReshapeCpuKernel() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
void SpecialCompute(int64_t start, int64_t end, const int64_t *in0, int64_t *out0, const int64_t *input_strides,
|
||||
const int64_t *output_strides, const int64_t input_rank, const int64_t output_rank);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,136 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_segment_sqrt_n_grad.h"
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 4;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *SparseSegmentSqrtNGrad = "SparseSegmentSqrtNGrad";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SparseSegmentSqrtNGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"SparseSegmentSqrtNGrad check input and output number failed.");
|
||||
Tensor *inputx = ctx.Input(0);
|
||||
Tensor *input_indices = ctx.Input(1);
|
||||
Tensor *input_segment_ids = ctx.Input(2);
|
||||
Tensor *input_output_dim = ctx.Input(3);
|
||||
|
||||
auto data_type0 = inputx->GetDataType();
|
||||
auto data_type1 = input_indices->GetDataType();
|
||||
auto data_type2 = input_segment_ids->GetDataType();
|
||||
auto data_type3 = input_output_dim->GetDataType();
|
||||
|
||||
if (data_type0 != DT_FLOAT && data_type0 != DT_DOUBLE && data_type0 != DT_FLOAT16) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtNGrad kernel data type [%u] not support.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (data_type1 != data_type2 || data_type1 != data_type3 || data_type1 != DT_INT32) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtNGrad kernel data type [%u] not support.", data_type1);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto shape0 = inputx->GetTensorShape();
|
||||
auto shape1 = input_indices->GetTensorShape();
|
||||
auto shape2 = input_segment_ids->GetTensorShape();
|
||||
auto scalarshape = input_output_dim->GetTensorShape();
|
||||
if (shape0->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input0's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape1->NumElements() != shape2->NumElements()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input1&input2's ranks mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (data_type0 == DT_FLOAT) {
|
||||
return ComputeKernel<float>(ctx);
|
||||
} else if (data_type0 == DT_DOUBLE) {
|
||||
return ComputeKernel<double>(ctx);
|
||||
} else {
|
||||
return ComputeKernel<Eigen::half>(ctx);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseSegmentSqrtNGradCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
|
||||
size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
size_t m = ctx.Input(2)->GetTensorShape()->NumElements();
|
||||
int l = ctx.Output(0)->GetTensorShape()->GetDimSize(0);
|
||||
auto x_addr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto indices_addr = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
|
||||
auto segment_ids_addr = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
|
||||
int k = *reinterpret_cast<int32_t *>(ctx.Input(3)->GetData());
|
||||
auto y_addr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
y_shape_values[0] = k;
|
||||
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
|
||||
|
||||
const size_t tensor_dim = 2;
|
||||
Eigen::TensorMap<Eigen::Tensor<T, tensor_dim>, Eigen::Aligned> res_map(y_addr, l, n);
|
||||
res_map.setZero();
|
||||
|
||||
for (size_t i = 1; i < m; i++) {
|
||||
if (segment_ids_addr[i] < segment_ids_addr[i - 1]) {
|
||||
KERNEL_LOG_ERROR("Segment_ids should be sorted.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < m; i++) {
|
||||
if (indices_addr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
|
||||
KERNEL_LOG_ERROR("Indices out of range.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (segment_ids_addr[i] >= k) {
|
||||
KERNEL_LOG_ERROR("Segment_ids out of range.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int beginindex = segment_ids_addr[0];
|
||||
size_t countnum = 1;
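// Scatter the gradient: every row that was gathered from segment `beginindex`
// in the forward pass receives x[beginindex] / sqrt(segment_size) at the
// positions named by `indices`, mirroring the forward SparseSegmentSqrtN normalization.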
|
||||
for (size_t i = 1; i < m; i++) {
|
||||
if (segment_ids_addr[i] == beginindex) {
|
||||
countnum++;
|
||||
continue;
|
||||
}
|
||||
for (size_t j = 1; j <= countnum; j++) {
|
||||
for (size_t l = 0; l < n; l++) {
|
||||
y_addr[indices_addr[i - j] * n + l] += x_addr[beginindex * n + l] / (T)(sqrt(countnum));
|
||||
}
|
||||
}
beginindex = segment_ids_addr[i];
countnum = 1;
|
||||
}
|
||||
|
||||
size_t i = m;
|
||||
for (size_t j = 1; j <= countnum; j++) {
|
||||
for (size_t l = 0; l < n; l++) {
|
||||
y_addr[indices_addr[i - j] * n + l] += x_addr[beginindex * n + l] / (T)(sqrt(countnum));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(SparseSegmentSqrtNGrad, SparseSegmentSqrtNGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,37 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SparseSegmentSqrtNGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SparseSegmentSqrtNGradCpuKernel() = default;
|
||||
~SparseSegmentSqrtNGradCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static uint32_t ComputeKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,186 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_segment_sqrt_n_with_num_segments.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 4;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *SparseSegmentSqrtNWithNumSegments = "SparseSegmentSqrtNWithNumSegments";
|
||||
|
||||
#define COMPUTE_CASE(DTYPE, TYPE, DTYPE_1, DTYPE_2, DTYPE_3, CTX) \
|
||||
case (DTYPE): \
|
||||
if ((DTYPE_1) == DT_INT32) { \
|
||||
if ((DTYPE_2) == DT_INT32) { \
|
||||
if ((DTYPE_3) == DT_INT32) { \
|
||||
return Computekernel<TYPE, int32_t, int32_t, int32_t>(CTX); \
|
||||
} else { \
|
||||
return Computekernel<TYPE, int32_t, int32_t, int64_t>(CTX); \
|
||||
} \
|
||||
} else { \
|
||||
if ((DTYPE_3) == DT_INT32) { \
|
||||
return Computekernel<TYPE, int32_t, int64_t, int32_t>(CTX); \
|
||||
} else { \
|
||||
return Computekernel<TYPE, int32_t, int64_t, int64_t>(CTX); \
|
||||
} \
|
||||
} \
|
||||
} else { \
|
||||
if ((DTYPE_2) == DT_INT32) { \
|
||||
if ((DTYPE_3) == DT_INT32) { \
|
||||
return Computekernel<TYPE, int64_t, int32_t, int32_t>(CTX); \
|
||||
} else { \
|
||||
return Computekernel<TYPE, int64_t, int32_t, int64_t>(CTX); \
|
||||
} \
|
||||
} else { \
|
||||
if ((DTYPE_3) == DT_INT32) { \
|
||||
return Computekernel<TYPE, int64_t, int64_t, int32_t>(CTX); \
|
||||
} else { \
|
||||
return Computekernel<TYPE, int64_t, int64_t, int64_t>(CTX); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
break;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SparseSegmentSqrtNWithNumSegmentsCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSqrtNWithNumSegments normalcheck failed.");
|
||||
Tensor *x = ctx.Input(0);
|
||||
Tensor *indices = ctx.Input(1);
|
||||
Tensor *segment_ids = ctx.Input(2);
|
||||
Tensor *num_segments = ctx.Input(3);
|
||||
|
||||
auto x_shape = x->GetTensorShape();
|
||||
auto indices_shape = indices->GetTensorShape();
|
||||
auto segment_ids_shape = segment_ids->GetTensorShape();
|
||||
auto num_segments_shape = num_segments->GetTensorShape();
|
||||
|
||||
if (x_shape->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor x's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (indices->NumElements() != segment_ids->NumElements()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor indices&segment_ids's ranks mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto x_data_type = x->GetDataType();
|
||||
auto indices_data_type = indices->GetDataType();
|
||||
auto segment_ids_data_type = segment_ids->GetDataType();
|
||||
auto num_segments_data_type = num_segments->GetDataType();
|
||||
|
||||
if (indices_data_type != DT_INT32 && indices_data_type != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtNWithNumSegments kernel data type [%s] not support.",
|
||||
DTypeStr(indices_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (segment_ids_data_type != DT_INT32 && segment_ids_data_type != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtNWithNumSegments kernel data type [%s] not support.",
|
||||
DTypeStr(segment_ids_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (num_segments_data_type != DT_INT32 && num_segments_data_type != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtNWithNumSegments kernel data type [%s] not support.",
|
||||
DTypeStr(num_segments_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
switch (x_data_type) {
|
||||
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
|
||||
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
|
||||
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR(
|
||||
"SparseSegmentSqrtNWithNumSegments kernel data type [%s] not "
|
||||
"support.",
|
||||
DTypeStr(x_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(SparseSegmentSqrtNWithNumSegments, SparseSegmentSqrtNWithNumSegmentsCpuKernel);
|
||||
|
||||
template <typename T1, typename T2, typename T3, typename T4>
|
||||
uint32_t SparseSegmentSqrtNWithNumSegmentsCpuKernel::Computekernel(CpuKernelContext &ctx) {
|
||||
int n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
int m = ctx.Input(2)->GetTensorShape()->NumElements();
|
||||
auto x_ptr = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
|
||||
auto indices_ptr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
|
||||
auto segment_ids_ptr = reinterpret_cast<T3 *>(ctx.Input(2)->GetData());
|
||||
auto num_segments_ptr = reinterpret_cast<T4 *>(ctx.Input(3)->GetData());
|
||||
auto y_ptr = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
|
||||
|
||||
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
y_shape_values[0] = num_segments_ptr[0];
|
||||
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
|
||||
|
||||
for (int64_t i = 1; i < m; i++) {
|
||||
if (segment_ids_ptr[i] < segment_ids_ptr[i - 1]) {
|
||||
KERNEL_LOG_ERROR("segment_ids should be sorted.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < m; i++) {
|
||||
if (indices_ptr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
|
||||
KERNEL_LOG_ERROR("indices out of range.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (segment_ids_ptr[i] >= num_segments_ptr[0]) {
|
||||
KERNEL_LOG_ERROR("segment_ids out of range.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < ctx.Output(0)->GetTensorShape()->NumElements(); i++) {
|
||||
y_ptr[i] = (T1)0;
|
||||
}
|
||||
|
||||
int oldindex = -1;
|
||||
int countnum = 0;
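// Accumulate the selected x rows per segment id, then rescale a segment by
// 1 / sqrt(count) as soon as the next segment id starts; the block after the
// loop rescales the last open segment.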
|
||||
for (int64_t i = 0; i < m; i++) {
|
||||
if (oldindex == segment_ids_ptr[i]) {
|
||||
countnum++;
|
||||
} else if (countnum != 0) {
|
||||
for (int64_t j = 0; j < n; j++) {
|
||||
y_ptr[j + oldindex * n] /= (static_cast<T1>(sqrt(countnum)));
|
||||
}
|
||||
countnum = 1;
|
||||
oldindex = segment_ids_ptr[i];
|
||||
} else {
|
||||
countnum = 1;
|
||||
oldindex = segment_ids_ptr[i];
|
||||
}
|
||||
for (int64_t j = 0; j < n; j++) {
|
||||
y_ptr[j + oldindex * n] += x_ptr[j + indices_ptr[i] * n];
|
||||
}
|
||||
}
|
||||
if (countnum != 0) {
|
||||
for (int64_t j = 0; j < n; j++) {
|
||||
y_ptr[j + oldindex * n] /= (static_cast<T1>(sqrt(countnum)));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_WITH_NUM_SEGMENTS_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_WITH_NUM_SEGMENTS_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SparseSegmentSqrtNWithNumSegmentsCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SparseSegmentSqrtNWithNumSegmentsCpuKernel() = default;
|
||||
~SparseSegmentSqrtNWithNumSegmentsCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2, typename T3, typename T4>
|
||||
uint32_t Computekernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,152 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sparse_segment_sum_with_num_segments.h"
|
||||
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 4;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *SparseSegmentSumWithNumSegments = "SparseSegmentSumWithNumSegments";
|
||||
#define COMPUTE_CASE(DTYPE, TYPE, ITYPE, CTX) \
|
||||
case (DTYPE): \
|
||||
if ((ITYPE) == DT_INT32) { \
|
||||
return ComputeKernel<TYPE, int32_t>(CTX); \
|
||||
} else { \
|
||||
return ComputeKernel<TYPE, int64_t>(CTX); \
|
||||
} \
|
||||
break;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SparseSegmentSumWithNumSegmentsCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSumWithNumSegments normalcheck failed.");
|
||||
Tensor *x = ctx.Input(0);
|
||||
Tensor *indices = ctx.Input(1);
|
||||
Tensor *segment_ids = ctx.Input(2);
|
||||
Tensor *num_segments = ctx.Input(3);
|
||||
|
||||
if (x->GetDataSize() == 0 || indices->GetDataSize() == 0 || segment_ids->GetDataSize() == 0 ||
|
||||
num_segments->GetDataSize() == 0) {
|
||||
KERNEL_LOG_ERROR("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto x_shape = x->GetTensorShape();
|
||||
auto indices_shape = indices->GetTensorShape();
|
||||
auto segment_ids_shape = segment_ids->GetTensorShape();
|
||||
auto num_segments_shape = num_segments->GetTensorShape();
|
||||
|
||||
if (x_shape->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor x's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (indices_shape->NumElements() != segment_ids_shape->NumElements()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor indices&segment_ids's ranks mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto x_data_type = x->GetDataType();
|
||||
auto indices_data_type = indices->GetDataType();
|
||||
auto segment_ids_data_type = segment_ids->GetDataType();
|
||||
auto num_segments_data_type = num_segments->GetDataType();
|
||||
|
||||
if (indices_data_type != DT_INT32 && indices_data_type != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSumWithNumSegments kernel data type [%s] not support.",
|
||||
DTypeStr(indices_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (segment_ids_data_type != indices_data_type || num_segments_data_type != indices_data_type) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSumWithNumSegments kernel data type mismatch.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
switch (x_data_type) {
|
||||
COMPUTE_CASE(DT_INT8, int8_t, indices_data_type, ctx)
|
||||
COMPUTE_CASE(DT_INT16, int16_t, indices_data_type, ctx)
|
||||
COMPUTE_CASE(DT_INT32, int32_t, indices_data_type, ctx)
|
||||
COMPUTE_CASE(DT_INT64, int64_t, indices_data_type, ctx)
|
||||
COMPUTE_CASE(DT_UINT8, uint8_t, indices_data_type, ctx)
|
||||
COMPUTE_CASE(DT_UINT16, uint16_t, indices_data_type, ctx)
|
||||
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, ctx)
|
||||
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, ctx)
|
||||
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SparseSegmentSumWithNumSegments kernel data type [%s] not support.",
|
||||
DTypeStr(x_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(SparseSegmentSumWithNumSegments, SparseSegmentSumWithNumSegmentsCpuKernel);
|
||||
|
||||
template <typename dataT, typename indicesT>
|
||||
uint32_t SparseSegmentSumWithNumSegmentsCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
|
||||
size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
size_t m = ctx.Input(2)->GetTensorShape()->NumElements();
|
||||
size_t num_elements = ctx.Output(0)->GetTensorShape()->NumElements();
|
||||
auto x_ptr = reinterpret_cast<dataT *>(ctx.Input(0)->GetData());
|
||||
auto indices_ptr = reinterpret_cast<indicesT *>(ctx.Input(1)->GetData());
|
||||
auto segment_ids_ptr = reinterpret_cast<indicesT *>(ctx.Input(2)->GetData());
|
||||
auto num_segments_ptr = reinterpret_cast<indicesT *>(ctx.Input(3)->GetData());
|
||||
auto y_ptr = reinterpret_cast<dataT *>(ctx.Output(0)->GetData());
|
||||
|
||||
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
y_shape_values[0] = num_segments_ptr[0];
|
||||
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
|
||||
|
||||
for (size_t i = 1; i < m; i++) {
|
||||
if (segment_ids_ptr[i] < segment_ids_ptr[i - 1]) {
|
||||
KERNEL_LOG_ERROR("segment_ids should be sorted.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < m; i++) {
|
||||
if (indices_ptr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
|
||||
KERNEL_LOG_ERROR("indices out of range.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (segment_ids_ptr[i] >= num_segments_ptr[0]) {
|
||||
KERNEL_LOG_ERROR("segment_ids out of range.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < num_elements; i++) {
|
||||
y_ptr[i] = (dataT)0;
|
||||
}
|
||||
|
||||
int oldindex = -1;
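// Plain scatter-add: each x row selected by indices is added into the output
// row given by the matching (sorted) segment id; no normalization is applied.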
|
||||
for (size_t i = 0; i < m; i++) {
|
||||
if (oldindex != segment_ids_ptr[i]) {
|
||||
oldindex = segment_ids_ptr[i];
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
y_ptr[j + oldindex * n] = (dataT)0;
|
||||
}
|
||||
}
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
y_ptr[j + oldindex * n] += x_ptr[j + indices_ptr[i] * n];
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SUM_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SUM_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SparseSegmentSumWithNumSegmentsCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SparseSegmentSumWithNumSegmentsCpuKernel() = default;
|
||||
~SparseSegmentSumWithNumSegmentsCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename dataT, typename indicesT>
|
||||
static uint32_t ComputeKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,253 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_softmax_cross_entropy_with_logits.h"
|
||||
#include <iostream>
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
namespace {
|
||||
const char *kSparseSoftmaxCrossEntropyWithLogits = "SparseSoftmaxCrossEntropyWithLogits";
|
||||
const uint32_t kOutputNum{2};
|
||||
const uint32_t kInputNum{2};
|
||||
const uint32_t kDimSizeTwo{2};
|
||||
const uint32_t kDimSizeOne{1};
|
||||
const uint32_t kParallelDataNum{2048};
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename data_type, typename label_type>
|
||||
void SparseSoftmaxCrossEntropyWithLogitsSingleOp(data_type *input_features, label_type *input_labels,
|
||||
data_type *output_loss, data_type *output_backprop, int64_t batch_size,
|
||||
int64_t classes_num, size_t features_total) {
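// Per row i: backprop[i] = softmax(features[i]) and
// loss[i] = -log(softmax(features[i])[labels[i]]); the row maximum is
// subtracted before exponentiation for numerical stability, and one is
// subtracted from the label position of backprop at the end.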
|
||||
double_t *dims_exp_sum = static_cast<double_t *>(malloc(batch_size * sizeof(double_t)));
|
||||
data_type *dims_maximum = static_cast<data_type *>(malloc(batch_size * sizeof(data_type)));
|
||||
memset(dims_exp_sum, 0, batch_size * sizeof(double_t));
|
||||
Eigen::TensorMap<Eigen::Tensor<data_type, kDimSizeTwo>, Eigen::Aligned> logits(input_features, batch_size,
|
||||
classes_num);
|
||||
Eigen::TensorMap<Eigen::Tensor<double_t, 1>, Eigen::Aligned> dims_sum(dims_exp_sum, batch_size);
|
||||
Eigen::TensorMap<Eigen::Tensor<data_type, 1>, Eigen::Aligned> dims_max(dims_maximum, batch_size);
|
||||
Eigen::array<int, 1> axes{{1}};
|
||||
// compute softmax
|
||||
dims_max = logits.maximum(axes);
|
||||
const data_type constant_one(1.0);
|
||||
for (size_t index = 0, batch_idx = 0; index < features_total; index++) {
|
||||
output_backprop[index] = Eigen::numext::exp(input_features[index] - dims_maximum[batch_idx]);
|
||||
dims_exp_sum[batch_idx] += static_cast<double_t>(output_backprop[index]);
|
||||
if ((index + 1) % classes_num == 0) {
|
||||
batch_idx++;
|
||||
}
|
||||
}
|
||||
dims_sum = dims_sum.inverse();
|
||||
for (size_t index = 0, batch_idx = 0; index < features_total; index++) {
|
||||
*(output_backprop + index) =
|
||||
static_cast<data_type>(static_cast<double_t>(*(output_backprop + index)) * dims_exp_sum[batch_idx]);
|
||||
if ((index + 1) % classes_num == 0) {
|
||||
batch_idx++;
|
||||
}
|
||||
}
|
||||
label_type offset = 0;
|
||||
for (int64_t index = 0, batch_base = 0; index < batch_size; ++index, batch_base += classes_num) {
|
||||
offset = input_labels[index];
|
||||
*(output_loss + index) = -Eigen::numext::log(*(output_backprop + batch_base + offset));
|
||||
*(output_backprop + batch_base + offset) = *(output_backprop + batch_base + offset) - constant_one;
|
||||
}
|
||||
free(dims_exp_sum);
|
||||
free(dims_maximum);
|
||||
}
|
||||
|
||||
template <typename data_type, typename label_type>
|
||||
void SparseSoftmaxCrossEntropyWithLogitsMultiOp(data_type *input_features, label_type *input_labels,
|
||||
data_type *output_loss, data_type *output_backprop, size_t begin,
|
||||
size_t end, int64_t classes_num, size_t features_total) {
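// Same computation as the single-threaded path above, restricted to batch rows
// [begin, end) so it can serve as a ParallelFor shard.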
|
||||
for (size_t index = begin; index < end; index++) {
|
||||
size_t batch_begin = index * classes_num;
|
||||
size_t batch_end = batch_begin + classes_num;
|
||||
data_type max_value = input_features[batch_begin];
|
||||
double_t sum_value{0};
|
||||
data_type constant_one{1};
|
||||
for (size_t idx = batch_begin; idx < batch_end; idx++) {
|
||||
if (max_value < input_features[idx]) {
|
||||
max_value = input_features[idx];
|
||||
}
|
||||
}
|
||||
for (size_t idx = batch_begin; idx < batch_end; idx++) {
|
||||
output_backprop[idx] = Eigen::numext::exp(input_features[idx] - max_value);
|
||||
sum_value += static_cast<double_t>(output_backprop[idx]);
|
||||
}
|
||||
sum_value = double_t(1.0) / sum_value;
|
||||
for (size_t idx = batch_begin; idx < batch_end; idx++) {
|
||||
output_backprop[idx] = static_cast<data_type>(static_cast<double_t>(output_backprop[idx]) * sum_value);
|
||||
if (idx % classes_num == static_cast<size_t>(input_labels[index])) {
|
||||
output_loss[index] = -Eigen::numext::log(output_backprop[idx]);
|
||||
output_backprop[idx] = output_backprop[idx] - constant_one;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::uint32_t SparseSoftmaxCrossEntropyWithLogitsExtraCheck(CpuKernelContext &ctx) {
|
||||
Tensor *input_features = ctx.Input(0);
|
||||
Tensor *input_labels = ctx.Input(1);
|
||||
Tensor *output_loss = ctx.Output(0);
|
||||
Tensor *output_backprop = ctx.Output(1);
|
||||
std::vector<int64_t> features_dims = input_features->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> labels_dims = input_labels->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> loss_dims = output_loss->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> backprop_dims = output_backprop->GetTensorShape()->GetDimSizes();
|
||||
if ((input_features->GetDataSize() == 0) || (input_labels->GetDataSize() == 0)) {
|
||||
KERNEL_LOG_INFO("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (input_features->GetDataType() != output_loss->GetDataType() ||
|
||||
input_features->GetDataType() != output_backprop->GetDataType()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data type of the input features [%s], output loss [%s], output "
|
||||
"backprop [%s] must be the same type.",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str(),
|
||||
DTypeStr(ctx.Output(1)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (input_labels->GetDataType() != DT_INT32 && input_labels->GetDataType() != DT_INT64) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data type of the input labels [%s], must be the int32 or int64 "
|
||||
"type.",
|
||||
DTypeStr(ctx.Input(1)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (features_dims.size() != kDimSizeTwo || labels_dims.size() != kDimSizeOne || loss_dims.size() != kDimSizeOne ||
|
||||
backprop_dims.size() != kDimSizeTwo) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The dims of the input features [%d], output backprop [%d] must be "
|
||||
"[batch_size x num_classes]. the dims of input labels [%d], output "
|
||||
"loss [%d] must be [batch_size].",
|
||||
features_dims.size(), backprop_dims.size(), labels_dims.size(), loss_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
int64_t batch_size = features_dims[0];
|
||||
int64_t num_classes = features_dims[1];
|
||||
if (labels_dims[0] != batch_size) {
|
||||
KERNEL_LOG_ERROR("the size of label must be equal with batch_size[%d]", batch_size);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (loss_dims[0] != batch_size) {
|
||||
KERNEL_LOG_ERROR("the size of loss must be equal with batch_size[%d]", batch_size);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (backprop_dims[0] != batch_size || backprop_dims[1] != num_classes) {
|
||||
KERNEL_LOG_ERROR("the size of label must be equal with [%d x %d], but get [%d x %d]", batch_size, num_classes,
|
||||
backprop_dims[0], backprop_dims[1]);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <typename data_type, typename label_type>
|
||||
inline uint32_t SparseSoftmaxCrossEntropyWithLogitsCompute(const CpuKernelContext &ctx) {
|
||||
size_t features_total = static_cast<size_t>(ctx.Input(0)->NumElements());
|
||||
uint64_t total_size = ctx.Input(0)->GetDataSize();
|
||||
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
auto *input_features = static_cast<data_type *>(ctx.Input(0)->GetData());
|
||||
auto *input_labels = static_cast<label_type *>(ctx.Input(1)->GetData());
|
||||
auto *output_loss = static_cast<data_type *>(ctx.Output(0)->GetData());
|
||||
auto *output_backprop = static_cast<data_type *>(ctx.Output(1)->GetData());
|
||||
bool multi_core_flag = false;
|
||||
if (total_size > kParallelDataNum * sizeof(data_type)) {
|
||||
multi_core_flag = true;
|
||||
}
|
||||
std::vector<std::int64_t> dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<std::int64_t> labels_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
for (int64_t idx = 0; idx < labels_dims[0]; idx++) {
|
||||
if (input_labels[idx] >= dims[1]) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"Received a label value of [%d] which is outside the valid range of "
|
||||
"[0, %d).",
|
||||
input_labels[idx], dims[1]);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
// Flatten all leading dimensions into batch_size; the last dimension is the class axis
|
||||
size_t pivot, classes_num;
|
||||
int64_t batch_size{1};
|
||||
pivot = dims.size() - 1;
|
||||
classes_num = dims[pivot];
|
||||
for (size_t index = 0; index < dims.size(); index++) {
|
||||
if (index < pivot) {
|
||||
batch_size *= dims[index];
|
||||
}
|
||||
}
|
||||
// Eigen::Array
|
||||
if (multi_core_flag) {
|
||||
std::int64_t per_unit_size{batch_size / std::min(std::max(1L, cores - 2L), batch_size)};
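// Shard over the batch dimension, leaving roughly two cores (cores - 2) free.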
|
||||
auto shard = [&](size_t begin, size_t end) {
|
||||
SparseSoftmaxCrossEntropyWithLogitsMultiOp(input_features, input_labels, output_loss, output_backprop, begin, end,
|
||||
classes_num, features_total);
|
||||
};
|
||||
CpuKernelUtils::ParallelFor(ctx, batch_size, per_unit_size, shard);
|
||||
} else if (cores != 0) {
|
||||
SparseSoftmaxCrossEntropyWithLogitsSingleOp<data_type, label_type>(
|
||||
input_features, input_labels, output_loss, output_backprop, batch_size, classes_num, features_total);
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("SparseSoftmaxCrossEntropyWithLogits compute failed.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SparseSoftmaxCrossEntropyWithLogitsCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
if (NormalCheck(ctx, kInputNum, kOutputNum) == KERNEL_STATUS_PARAM_INVALID) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (SparseSoftmaxCrossEntropyWithLogitsExtraCheck(ctx) == KERNEL_STATUS_PARAM_INVALID) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
// choose compute function depend on dataType
|
||||
auto data_type = static_cast<DataType>(ctx.Input(0)->GetDataType());
|
||||
auto labels_type = static_cast<DataType>(ctx.Input(1)->GetDataType());
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16: {
|
||||
if (labels_type == DT_INT32) {
|
||||
return SparseSoftmaxCrossEntropyWithLogitsCompute<Eigen::half, std::int32_t>(ctx);
|
||||
} else if (labels_type == DT_INT64) {
|
||||
return SparseSoftmaxCrossEntropyWithLogitsCompute<Eigen::half, std::int64_t>(ctx);
|
||||
}
|
||||
}
|
||||
case DT_FLOAT: {
|
||||
if (labels_type == DT_INT32) {
|
||||
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::float_t, std::int32_t>(ctx);
|
||||
} else if (labels_type == DT_INT64) {
|
||||
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::float_t, std::int64_t>(ctx);
|
||||
}
|
||||
}
|
||||
case DT_DOUBLE: {
|
||||
if (labels_type == DT_INT32) {
|
||||
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::double_t, std::int32_t>(ctx);
|
||||
} else if (labels_type == DT_INT64) {
|
||||
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::double_t, std::int64_t>(ctx);
|
||||
}
|
||||
}
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSparseSoftmaxCrossEntropyWithLogits, SparseSoftmaxCrossEntropyWithLogitsCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,27 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSESOFTMAXENTROPYWITHLOGITS_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSESOFTMAXENTROPYWITHLOGITS_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SparseSoftmaxCrossEntropyWithLogitsCpuKernel final : public CpuKernel {
|
||||
public:
std::uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,241 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_sparse_maximum.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kSparseSparseMaximum = "SparseSparseMaximum";
|
||||
const uint32_t kOutputNum = 2;
|
||||
const uint32_t kInputNum = 6;
|
||||
constexpr int64_t kIndex0 = 0;
|
||||
constexpr int64_t kIndex1 = 1;
|
||||
constexpr int64_t kIndex2 = 2;
|
||||
constexpr int64_t kIndex3 = 3;
|
||||
constexpr int64_t kIndex4 = 4;
|
||||
constexpr int64_t kIndex5 = 5;
|
||||
bool isMatrix(const std::shared_ptr<aicpu::TensorShape> shape) { return shape->GetDims() == 2; }
|
||||
bool isVector(const std::shared_ptr<aicpu::TensorShape> shape) { return shape->GetDims() == 1; }
|
||||
} // namespace
|
||||
// Define namespace aicpu
|
||||
namespace aicpu {
|
||||
uint32_t SparseMaximumCpuKernel::NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &databank) {
|
||||
databank.a_indices_t = ctx.Input(kIndex0);
|
||||
databank.a_values_t = ctx.Input(kIndex1);
|
||||
databank.a_shape_t = ctx.Input(kIndex2);
|
||||
databank.b_indices_t = ctx.Input(kIndex3);
|
||||
databank.b_values_t = ctx.Input(kIndex4);
|
||||
databank.b_shape_t = ctx.Input(kIndex5);
|
||||
databank.output_indices_t = ctx.Output(kIndex0);
|
||||
databank.output_values_t = ctx.Output(kIndex1);
|
||||
KERNEL_CHECK_FALSE(
|
||||
isMatrix(databank.a_indices_t->GetTensorShape()) && isMatrix(databank.b_indices_t->GetTensorShape()),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Inputs a_indices and b_indices should be "
|
||||
"matrices but received shapes: [%d], [%d]",
|
||||
databank.a_indices_t->GetTensorShape()->GetDims(), databank.b_indices_t->GetTensorShape()->GetDims());
|
||||
KERNEL_CHECK_FALSE(isVector(databank.a_values_t->GetTensorShape()) && isVector(databank.b_values_t->GetTensorShape()),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Inputs a_values and b_values should be vectors "
|
||||
"but received shapes: [%d] and [%d]",
|
||||
databank.a_values_t->GetTensorShape()->GetDims(),
|
||||
databank.b_values_t->GetTensorShape()->GetDims());
|
||||
KERNEL_CHECK_FALSE(isVector(databank.a_shape_t->GetTensorShape()) && isVector(databank.b_shape_t->GetTensorShape()),
|
||||
KERNEL_STATUS_PARAM_INVALID, "Input shapes should be a vector but received shapes [%d] and [%d]",
|
||||
databank.a_shape_t->GetTensorShape()->GetDims(), databank.b_shape_t->GetTensorShape()->GetDims());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
inline static int64_t cmp(const TTypes<int64_t>::Matrix &a_idx, const TTypes<int64_t>::Matrix &b_idx,
|
||||
const int64_t a_row, const int64_t b_row, const int64_t dims) {
|
||||
for (int d = 0; d < dims; ++d) {
|
||||
const int64_t a = a_idx(a_row, d);
|
||||
const int64_t b = b_idx(b_row, d);
|
||||
if (a < b) {
|
||||
return -1;
|
||||
} else if (a > b) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void SparseMaximumCpuKernel::UnionSparseIndicesAndValues(typename TTypes<int64_t>::Matrix a_indices_mat,
|
||||
typename TTypes<T>::Flat a_values, int64_t a_nnz,
|
||||
typename TTypes<int64_t>::Matrix b_indices_mat,
|
||||
typename TTypes<T>::Flat b_values, int64_t b_nnz,
|
||||
int64_t num_dims, std::vector<T> *a_augmented_values,
|
||||
std::vector<T> *b_augmented_values,
|
||||
std::vector<std::pair<bool, int64_t>> *entries_to_copy) {
|
||||
entries_to_copy->reserve(a_nnz + b_nnz);
|
||||
a_augmented_values->reserve(a_nnz);
|
||||
b_augmented_values->reserve(b_nnz);
|
||||
|
||||
int64_t i = 0, j = 0;
|
||||
const T kZero = T(0);
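// Merge the two lexicographically sorted index lists (the merge step of merge
// sort); an index missing from one operand contributes a zero value, so the
// element-wise maximum over the union stays well defined.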
|
||||
while (i < a_nnz && j < b_nnz) {
|
||||
switch (cmp(a_indices_mat, b_indices_mat, i, j, num_dims)) {
|
||||
case -1:
|
||||
entries_to_copy->emplace_back(true, i);
|
||||
a_augmented_values->push_back(a_values(i));
|
||||
b_augmented_values->push_back(kZero);
|
||||
++i;
|
||||
break;
|
||||
case 0:
|
||||
entries_to_copy->emplace_back(true, i);
|
||||
a_augmented_values->push_back(a_values(i));
|
||||
b_augmented_values->push_back(b_values(j));
|
||||
++i;
|
||||
++j;
|
||||
break;
|
||||
case 1:
|
||||
entries_to_copy->emplace_back(false, j);
|
||||
a_augmented_values->push_back(kZero);
|
||||
b_augmented_values->push_back(b_values(j));
|
||||
++j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Handles leftovers; at most one loop runs.
|
||||
while (i < a_nnz) {
|
||||
entries_to_copy->emplace_back(true, i);
|
||||
a_augmented_values->push_back(a_values(i++));
|
||||
b_augmented_values->push_back(kZero);
|
||||
}
|
||||
while (j < b_nnz) {
|
||||
entries_to_copy->emplace_back(false, j);
|
||||
a_augmented_values->push_back(kZero);
|
||||
b_augmented_values->push_back(b_values(j++));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseMaximumCpuKernel::EigenedSparseMax(DataBank &databank) {
|
||||
const int64_t a_nnz = databank.a_indices_t->GetTensorShape()->GetDimSize(0);
|
||||
const int64_t b_nnz = databank.b_indices_t->GetTensorShape()->GetDimSize(0);
|
||||
EigenTensor a_values_t(databank.a_values_t, databank.a_values_t->GetData());
|
||||
const auto a_values = a_values_t.vec<T>();
|
||||
EigenTensor b_values_t(databank.b_values_t, databank.b_values_t->GetData());
|
||||
const auto b_values = b_values_t.vec<T>();
|
||||
|
||||
EigenTensor a_indices_t(databank.a_indices_t, databank.a_indices_t->GetData());
|
||||
const auto a_indices_mat = a_indices_t.matrix<int64_t>();
|
||||
EigenTensor b_indices_t(databank.b_indices_t, databank.b_indices_t->GetData());
|
||||
const auto b_indices_mat = b_indices_t.matrix<int64_t>();
|
||||
|
||||
const int64_t num_dims = databank.a_indices_t->GetTensorShape()->GetDimSize(1);
|
||||
EigenTensor a_shape_t(databank.a_shape_t, databank.a_shape_t->GetData());
|
||||
const auto a_shape = a_shape_t.flat<int64_t>();
|
||||
EigenTensor b_shape_t(databank.b_shape_t, databank.b_shape_t->GetData());
|
||||
const auto b_shape = b_shape_t.flat<int64_t>();
|
||||
|
||||
KERNEL_CHECK_FALSE(a_values.size() == a_nnz && b_values.size() == b_nnz, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Expected [%d] and [%d] non-empty input values, got [%d] and [%d]", a_nnz, b_nnz, a_values.size(),
|
||||
b_values.size());
|
||||
KERNEL_CHECK_FALSE(databank.a_shape_t->GetTensorShape()->NumElements() == num_dims, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Second dimension of a_indices and length of "
|
||||
"a_shape must match, got [%d] and [%d]",
|
||||
databank.a_shape_t->GetTensorShape()->NumElements(), num_dims);
|
||||
KERNEL_CHECK_FALSE(num_dims > 0, KERNEL_STATUS_PARAM_INVALID, "Tensors must not be empty");
|
||||
KERNEL_CHECK_FALSE(
|
||||
databank.a_shape_t->GetTensorShape()->NumElements() == databank.b_shape_t->GetTensorShape()->NumElements(),
|
||||
KERNEL_STATUS_PARAM_INVALID, "Operands do not have the same ranks; got shapes: [%d] and [%d]",
|
||||
databank.a_shape_t->GetTensorShape()->NumElements(), databank.b_shape_t->GetTensorShape()->NumElements());
|
||||
|
||||
for (int i = 0; i < num_dims; ++i) {
|
||||
KERNEL_CHECK_FALSE(a_shape(i) == b_shape(i), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Operands' shapes do not match: got [%d] and [%d] for dimension [%d]", a_shape(i), b_shape(i), i)
|
||||
}
|
||||
|
||||
std::vector<T> a_augmented_values, b_augmented_values;
|
||||
std::vector<std::pair<bool, int64_t>> entries_to_copy; // from_a?, idx
|
||||
UnionSparseIndicesAndValues(a_indices_mat, a_values, a_nnz, b_indices_mat, b_values, b_nnz, num_dims,
|
||||
&a_augmented_values, &b_augmented_values, &entries_to_copy);
|
||||
|
||||
const int64_t sum_nnz = a_augmented_values.size();
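// Copy each union index from whichever operand owns it, then take the
// element-wise max of the two zero-padded value vectors.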
|
||||
EigenTensor output_values_t(databank.output_values_t, databank.output_values_t->GetData());
|
||||
EigenTensor output_indices_t(databank.output_indices_t, databank.output_indices_t->GetData());
|
||||
auto output_indices_mat = output_indices_t.matrix<int64_t>();
|
||||
for (int64_t i = 0; i < sum_nnz; ++i) {
|
||||
const bool from_a = entries_to_copy[i].first;
|
||||
const int64_t idx = entries_to_copy[i].second;
|
||||
output_indices_mat.chip<0>(i) = from_a ? a_indices_mat.chip<0>(idx) : b_indices_mat.chip<0>(idx);
|
||||
}
|
||||
|
||||
using UnalignedTensorMap = Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>, Eigen::Unaligned>;
|
||||
auto a_augmented_values_t = UnalignedTensorMap(a_augmented_values.data(), sum_nnz);
|
||||
auto b_augmented_values_t = UnalignedTensorMap(b_augmented_values.data(), sum_nnz);
|
||||
|
||||
output_values_t.flat<T>() =
|
||||
a_augmented_values_t.binaryExpr(b_augmented_values_t, Eigen::internal::scalar_max_op<T, T>());
|
||||
databank.output_indices_t->GetTensorShape()->SetDimSizes({sum_nnz, num_dims});
|
||||
databank.output_values_t->GetTensorShape()->SetDimSizes({sum_nnz});
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SparseMaximumCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"SparseSparseMaximum check input and output number failed.");
|
||||
|
||||
DataBank databank;
|
||||
KERNEL_HANDLE_ERROR(NullptrAndMatVecCheck(ctx, databank), "SparseSparseMaximum check params failed.");
|
||||
|
||||
DataType dt = static_cast<DataType>(databank.output_values_t->GetDataType());
|
||||
uint32_t KERNEL_STATUS;
|
||||
switch (dt) {
|
||||
case DT_INT8:
|
||||
KERNEL_STATUS = EigenedSparseMax<int8_t>(databank);
|
||||
break;
|
||||
case DT_UINT8:
|
||||
KERNEL_STATUS = EigenedSparseMax<uint8_t>(databank);
|
||||
break;
|
||||
case DT_INT16:
|
||||
KERNEL_STATUS = EigenedSparseMax<int16_t>(databank);
|
||||
break;
|
||||
case DT_UINT16:
|
||||
KERNEL_STATUS = EigenedSparseMax<uint16_t>(databank);
|
||||
break;
|
||||
case DT_INT32:
|
||||
KERNEL_STATUS = EigenedSparseMax<int32_t>(databank);
|
||||
break;
|
||||
case DT_INT64:
|
||||
KERNEL_STATUS = EigenedSparseMax<int64_t>(databank);
|
||||
break;
|
||||
case DT_FLOAT16:
|
||||
KERNEL_STATUS = EigenedSparseMax<Eigen::half>(databank);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
KERNEL_STATUS = EigenedSparseMax<float>(databank);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
KERNEL_STATUS = EigenedSparseMax<double>(databank);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SparseSparseMaximum can't support this data type [%d].", dt);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (KERNEL_STATUS != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("SparseSparseMaximum failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
// Register the kernel implementation
|
||||
REGISTER_CPU_KERNEL(kSparseSparseMaximum, SparseMaximumCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,59 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
struct DataBank {
|
||||
DataBank()
|
||||
: a_indices_t(nullptr),
|
||||
a_values_t(nullptr),
|
||||
a_shape_t(nullptr),
|
||||
b_indices_t(nullptr),
|
||||
b_values_t(nullptr),
|
||||
b_shape_t(nullptr) {}
|
||||
Tensor *a_indices_t;
|
||||
Tensor *a_values_t;
|
||||
Tensor *a_shape_t;
|
||||
Tensor *b_indices_t;
|
||||
Tensor *b_values_t;
|
||||
Tensor *b_shape_t;
|
||||
Tensor *output_indices_t;
|
||||
Tensor *output_values_t;
|
||||
};
|
||||
|
||||
class SparseMaximumCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~SparseMaximumCpuKernel() = default;
|
||||
SparseMaximumCpuKernel() = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static void UnionSparseIndicesAndValues(typename TTypes<int64_t>::Matrix a_indices_mat,
|
||||
typename TTypes<T>::Flat a_values, int64_t a_nnz,
|
||||
typename TTypes<int64_t>::Matrix b_indices_mat,
|
||||
typename TTypes<T>::Flat b_values, int64_t b_nnz, int64_t num_dims,
|
||||
std::vector<T> *a_augmented_values, std::vector<T> *b_augmented_values,
|
||||
std::vector<std::pair<bool, int64_t>> *entries_to_copy);
|
||||
|
||||
template <typename T>
|
||||
uint32_t EigenedSparseMax(DataBank &databank);
|
||||
|
||||
static uint32_t NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &calc_info);
|
||||
};
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,207 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sparse_sparse_minimum.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 2;
|
||||
const uint32_t kInputNum = 6;
|
||||
const char *kSparseSparseMinimum = "SparseSparseMinimum";
|
||||
|
||||
#define SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SparseSparseMinimumCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SparseSparseMinimum kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SparseSparseMinimumCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSparseMinimum normal check failed.");
|
||||
|
||||
const Tensor *x1_indices = ctx.Input(0);
|
||||
const Tensor *x1_values_t = ctx.Input(1);
|
||||
const Tensor *x1_shape = ctx.Input(2);
|
||||
const Tensor *x2_indices = ctx.Input(3);
|
||||
const Tensor *x2_values_t = ctx.Input(4);
|
||||
const Tensor *x2_shape = ctx.Input(5);
|
||||
|
||||
auto x1_indices_shape = x1_indices->GetTensorShape();
|
||||
auto x2_indices_shape = x2_indices->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE(((x1_indices_shape->GetDims() == 2) && (x2_indices_shape->GetDims() == 2)),
|
||||
KERNEL_STATUS_PARAM_INVALID, "Input indices should be matrices but received dims: %d and %d.",
|
||||
x1_indices_shape->GetDims(), x2_indices_shape->GetDims())
|
||||
const int64_t x1_nnz = x1_indices_shape->GetDimSize(0);
|
||||
const int64_t x2_nnz = x2_indices_shape->GetDimSize(0);
|
||||
|
||||
auto x1_values_shape = x1_values_t->GetTensorShape();
|
||||
auto x2_values_shape = x2_values_t->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE(((x1_values_shape->GetDims() == 1) && (x2_values_shape->GetDims() == 1)),
|
||||
KERNEL_STATUS_PARAM_INVALID, "Input values should be vectors but received dims: %d and %d.",
|
||||
x1_values_shape->GetDims(), x2_values_shape->GetDims())
|
||||
KERNEL_CHECK_FALSE(((x1_values_t->NumElements() == x1_nnz) && (x2_values_t->NumElements() == x2_nnz)),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Expected %d and %d non-empty input values, but received : %d and %d.", x1_nnz, x2_nnz,
|
||||
x1_values_t->NumElements(), x2_values_t->NumElements())
|
||||
KERNEL_CHECK_FALSE((x1_values_t->GetDataType() == x2_values_t->GetDataType()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Data types of the input values should be the same, but "
|
||||
"received %d-th and %d-th data type in the DataType enum.",
|
||||
x1_values_t->GetDataType(), x2_values_t->GetDataType())
|
||||
|
||||
auto x1_shape_shape = x1_shape->GetTensorShape();
|
||||
auto x2_shape_shape = x2_shape->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE(((x1_shape_shape->GetDims() == 1) && (x2_shape_shape->GetDims() == 1)),
|
||||
KERNEL_STATUS_PARAM_INVALID, "Input shapes should be vectors but received dims: %d and %d.",
|
||||
x1_shape_shape->GetDims(), x2_shape_shape->GetDims())
|
||||
KERNEL_CHECK_FALSE((x1_shape_shape->GetDimSize(0) == x2_shape_shape->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Operands' should have the same ranks but received: %d and %d.", x1_shape_shape->GetDimSize(0),
|
||||
x2_shape_shape->GetDimSize(0))
|
||||
auto shape_x1 = reinterpret_cast<int64_t *>(x1_shape->GetData());
|
||||
auto shape_x2 = reinterpret_cast<int64_t *>(x2_shape->GetData());
|
||||
for (int i = 0; i < x1_shape->NumElements(); ++i) {
|
||||
KERNEL_CHECK_FALSE(shape_x1[i] == shape_x2[i], KERNEL_STATUS_PARAM_INVALID,
|
||||
"Operands' shapes do not match: got %d and %d for dimension %d", shape_x1[i], shape_x2[i], i)
|
||||
}
|
||||
|
||||
auto data_type = ctx.Input(1)->GetDataType();
|
||||
switch (data_type) {
|
||||
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SparseSparseMinimum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
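// Lexicographically compare row x_row of x_idx with row y_row of y_idx over the first `dims` coordinates.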
int SparseSparseMinimumCpuKernel::cmp(const TTypes<int64_t>::ConstMatrix &x_idx, const int64_t x_row, const int dims,
|
||||
const TTypes<int64_t>::ConstMatrix &y_idx, const int64_t y_row) {
|
||||
for (int d = 0; d < dims; ++d) {
|
||||
const int64_t x = x_idx(x_row, d);
|
||||
const int64_t y = y_idx(y_row, d);
|
||||
if (x < y) {
|
||||
return -1;
|
||||
} else if (x > y) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseSparseMinimumCpuKernel::SparseSparseMinimumCompute(CpuKernelContext &ctx) {
|
||||
const EigenTensor x1_indices_ET(ctx.Input(0), ctx.Input(0)->GetData());
|
||||
const EigenTensor x2_indices_ET(ctx.Input(3), ctx.Input(3)->GetData());
|
||||
auto x1_indices_mat = x1_indices_ET.matrix<int64_t>();
|
||||
auto x2_indices_mat = x2_indices_ET.matrix<int64_t>();
|
||||
|
||||
const int64_t x1_nnz = x1_indices_mat.dimension(0);
|
||||
const int64_t x2_nnz = x2_indices_mat.dimension(0);
|
||||
std::vector<std::pair<bool, int64_t>> entries_to_copy;
|
||||
entries_to_copy.reserve(x1_nnz + x2_nnz);
|
||||
std::vector<T> out_values;
|
||||
const int num_dims = ctx.Input(2)->GetTensorShape()->GetDimSize(0);
|
||||
|
||||
EigenTensor x1_values_ET(ctx.Input(1), ctx.Input(1)->GetData());
|
||||
EigenTensor x2_values_ET(ctx.Input(4), ctx.Input(4)->GetData());
|
||||
auto x1_values = x1_values_ET.vec<T>();
|
||||
auto x2_values = x2_values_ET.vec<T>();
|
||||
int64_t i = 0, j = 0;
|
||||
T s;
|
||||
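// Two-pointer merge over the two (assumed canonically ordered) index lists: a coordinate present in
// only one operand is paired with an implicit 0 from the other, e.g. x1 = {(0,0):1, (1,2):3} and
// x2 = {(0,0):2, (2,1):-4} yield values {min(1,2), min(3,0), min(0,-4)} = {1, 0, -4}.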
while (i < x1_nnz && j < x2_nnz) {
|
||||
switch (cmp(x1_indices_mat, i, num_dims, x2_indices_mat, j)) {
|
||||
case -1:
|
||||
s = std::min(x1_values(i), T(0));
|
||||
entries_to_copy.emplace_back(true, i);
|
||||
out_values.push_back(s);
|
||||
++i;
|
||||
break;
|
||||
case 0:
|
||||
s = std::min(x1_values(i), x2_values(j));
|
||||
entries_to_copy.emplace_back(true, i);
|
||||
out_values.push_back(s);
|
||||
++i;
|
||||
++j;
|
||||
break;
|
||||
case 1:
|
||||
s = std::min(T(0), x2_values(j));
|
||||
entries_to_copy.emplace_back(false, j);
|
||||
out_values.push_back(s);
|
||||
++j;
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Some inner error happens in the SparseSparseMinimum computation.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
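// Drain whatever remains of the longer operand; each leftover value is again compared against an implicit 0.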
#define HANDLE_LEFTOVERS(X1_OR_X2, IDX, IS_A) \
|
||||
while ((IDX) < X1_OR_X2##_nnz) { \
|
||||
entries_to_copy.emplace_back(IS_A, IDX); \
|
||||
s = std::min((X1_OR_X2##_values)((IDX)), T(0)); \
|
||||
out_values.push_back(s); \
|
||||
++(IDX); \
|
||||
}
|
||||
|
||||
HANDLE_LEFTOVERS(x1, i, true);
|
||||
HANDLE_LEFTOVERS(x2, j, false);
|
||||
#undef HANDLE_LEFTOVERS
|
||||
|
||||
const int64_t y_nnz = out_values.size();
|
||||
Tensor *out_indices_t = ctx.Output(0);
|
||||
EigenTensor out_indices_ET(out_indices_t, out_indices_t->GetData());
|
||||
auto out_indices_mat = out_indices_ET.matrix<int64_t>();
|
||||
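// Copy the selected index rows into the output; the output shapes are resized to the actual nnz below.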
for (int64_t i = 0; i < y_nnz; ++i) {
|
||||
const bool from_x1 = entries_to_copy[i].first;
|
||||
const int64_t idx = entries_to_copy[i].second;
|
||||
out_indices_mat.chip<0>(i) = from_x1 ? x1_indices_mat.chip<0>(idx) : x2_indices_mat.chip<0>(idx);
|
||||
}
|
||||
std::vector<int64_t> indices_dims = {y_nnz, num_dims};
|
||||
auto out_indices_shape = out_indices_t->GetTensorShape();
|
||||
out_indices_shape->SetDimSizes(indices_dims);
|
||||
out_indices_t->SetTensorShape(out_indices_shape.get());
|
||||
|
||||
Tensor *out_values_t = ctx.Output(1);
|
||||
EigenTensor out_values_ET(out_values_t, out_values_t->GetData());
|
||||
auto out_values_flat = out_values_ET.vec<T>();
|
||||
if (y_nnz > 0) {
|
||||
std::copy_n(out_values.begin(), y_nnz, &out_values_flat(0));
|
||||
}
|
||||
std::vector<int64_t> values_dims = {y_nnz};
|
||||
auto out_values_shape = out_values_t->GetTensorShape();
|
||||
out_values_shape->SetDimSizes(values_dims);
|
||||
out_values_t->SetTensorShape(out_values_shape.get());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSparseSparseMinimum, SparseSparseMinimumCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SPARSE_MINIMUM_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSE_SPARSE_MINIMUM_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "eigen_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class SparseSparseMinimumCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SparseSparseMinimumCpuKernel() = default;
|
||||
~SparseSparseMinimumCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static uint32_t SparseSparseMinimumCompute(CpuKernelContext &ctx);
|
||||
|
||||
static int cmp(const TTypes<int64_t>::ConstMatrix &x_idx, const int64_t x_row, const int dims,
|
||||
const TTypes<int64_t>::ConstMatrix &y_idx, const int64_t y_row);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,301 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparseaddmm.h"
|
||||
#include <securec.h>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 7;
|
||||
const char *kSparseAddmm = "SparseAddmm";
|
||||
constexpr int64_t kParallelDataNums = 16;
|
||||
|
||||
#define SPARSEADDMM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
if (indices_type == DT_INT64) { \
|
||||
uint32_t result = SparseAddmmCompute<TYPE, int64_t>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SparseAddmm kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
} else { \
|
||||
uint32_t result = SparseAddmmCompute<TYPE, int32_t>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SparseAddmm kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
} \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SparseAddmmCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSparseAddmm);
|
||||
KERNEL_HANDLE_ERROR(SparseAddmmCheck(ctx), "[%s] check params failed.", kSparseAddmm);
|
||||
DataType data_type = ctx.Input(1)->GetDataType();
|
||||
DataType data_type1 = ctx.Input(3)->GetDataType();
|
||||
DataType indices_type = ctx.Input(0)->GetDataType();
|
||||
if (data_type != data_type1) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"sparse data type is no equal dense data type, sparsetype [%d], "
|
||||
"densetype [%d].",
|
||||
data_type, data_type1);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
switch (data_type) {
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
SPARSEADDMM_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SparseAddmm kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SparseAddmmCpuKernel::SparseAddmmCheck(CpuKernelContext &ctx) {
|
||||
Tensor *indices_tensor = ctx.Input(0);
|
||||
Tensor *values_tensor = ctx.Input(1);
|
||||
Tensor *shape_tensor = ctx.Input(2);
|
||||
Tensor *dense_tensor = ctx.Input(3);
|
||||
Tensor *alpha_tensor = ctx.Input(5);
|
||||
Tensor *beta_tensor = ctx.Input(6);
|
||||
|
||||
if (alpha_tensor->GetTensorShape()->NumElements() != 1) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"alpha_tensor should be a number,but got NumElements "
|
||||
"[%d].",
|
||||
alpha_tensor->GetTensorShape()->NumElements());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (beta_tensor->GetTensorShape()->NumElements() != 1) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"beta_tensor should be a number,but got NumElements "
|
||||
"[%d].",
|
||||
beta_tensor->GetTensorShape()->NumElements());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
// valid shape nullptr
|
||||
auto sparse_shape = shape_tensor->GetTensorShape();
|
||||
auto values_shape = values_tensor->GetTensorShape();
|
||||
auto dense_tensor_shape = dense_tensor->GetTensorShape();
|
||||
auto indices_shape = indices_tensor->GetTensorShape();
|
||||
// sparse_indices
|
||||
if (indices_shape->GetDims() > 2) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"Sparse_indices should be a scalar, vector, or matrix, got dim "
|
||||
"size [%d].",
|
||||
indices_shape->GetDims());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
const int64_t elems_num = indices_shape->GetDims() > 0 ? indices_shape->GetDimSize(0) : 1;
|
||||
const int64_t dims_num = indices_shape->GetDims() > 1 ? indices_shape->GetDimSize(1) : 1;
|
||||
|
||||
// output_shape
|
||||
if (sparse_shape->GetDims() != 1) {
|
||||
KERNEL_LOG_ERROR("Sparse_shape should be a vector, got dim size [%d].", sparse_shape->GetDims());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape_tensor->NumElements() != dims_num) {
|
||||
KERNEL_LOG_ERROR("Sparse_shape has incorrect number of elements [%lld], should be [%lld]",
|
||||
shape_tensor->NumElements(), dims_num);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
// valid data type
|
||||
int32_t IndiceType = indices_tensor->GetDataType();
|
||||
int32_t ShapeType = shape_tensor->GetDataType();
|
||||
bool validIndiceType = ((IndiceType != DT_INT32) && (IndiceType != DT_INT64));
|
||||
bool validShapeType = ((ShapeType != DT_INT32) && (ShapeType != DT_INT64));
|
||||
if (validShapeType || validIndiceType) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"Valid indice or Sparse shape data type failed, indiceType [%d], "
|
||||
"shapeType [%d].",
|
||||
IndiceType, ShapeType);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
// sparse_values
|
||||
int32_t values_dims_size = values_shape->GetDims();
|
||||
if ((values_dims_size != 0) && (values_dims_size != 1)) {
|
||||
KERNEL_LOG_ERROR("Values_shape should be a scalar or a vector, got dim size [%d].", values_shape->GetDims());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if ((values_dims_size == 1) && (values_tensor->NumElements() != elems_num)) {
|
||||
KERNEL_LOG_ERROR("Values_shape has incorrect number of elements [%lld], should be [%lld]",
|
||||
values_tensor->NumElements(), elems_num);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename T1>
|
||||
uint32_t SparseAddmmCpuKernel::SparseAddmmCompute(CpuKernelContext &ctx) {
|
||||
auto *indices_tensor = ctx.Input(0);
|
||||
auto *values_tensor = ctx.Input(1);
|
||||
auto *shape_tensor = ctx.Input(2);
|
||||
auto *dense_tensor = ctx.Input(3);
|
||||
auto *x3_dense_tensor = ctx.Input(4);
|
||||
auto *alpha_tensor = ctx.Input(5);
|
||||
auto *beta_tensor = ctx.Input(6);
|
||||
auto *output_tensor = ctx.Output(0);
|
||||
|
||||
// auto indices = reinterpret_cast<int64_t *>(indices_tensor->GetData());
|
||||
auto values = reinterpret_cast<T *>(values_tensor->GetData());
|
||||
auto dense_data = reinterpret_cast<T *>(dense_tensor->GetData());
|
||||
auto x3_dense_data = reinterpret_cast<T *>(x3_dense_tensor->GetData());
|
||||
auto alpha = reinterpret_cast<T *>(alpha_tensor->GetData());
|
||||
auto beta = reinterpret_cast<T *>(beta_tensor->GetData());
|
||||
auto y = reinterpret_cast<T *>(output_tensor->GetData());
|
||||
|
||||
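// Read the dense shape of the sparse operand; the shape tensor may hold int32 or int64 values.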
std::vector<int64_t> temp_shape;
|
||||
for (int32_t index = 0; index < shape_tensor->GetTensorShape()->GetDimSize(0); ++index) {
|
||||
if (shape_tensor->GetDataType() == DT_INT32) {
|
||||
int32_t *temp_dim = reinterpret_cast<int32_t *>(shape_tensor->GetData());
|
||||
temp_shape.emplace_back(static_cast<int64_t>(temp_dim[index]));
|
||||
} else {
|
||||
int64_t *temp_dim = reinterpret_cast<int64_t *>(shape_tensor->GetData());
|
||||
temp_shape.emplace_back(temp_dim[index]);
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t row_x1 = temp_shape[0];
|
||||
const int64_t col_x1 = temp_shape[1];
|
||||
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> sparse(row_x1, col_x1);
|
||||
sparse.setZero(row_x1, col_x1);
|
||||
std::vector<int64_t> temp_indices;
|
||||
auto indices_one = indices_tensor->GetTensorShape()->GetDimSize(0);
|
||||
auto indices_two = indices_tensor->GetTensorShape()->GetDimSize(1);
|
||||
for (int32_t index = 0; index < indices_one; ++index) {
|
||||
if (indices_tensor->GetDataType() == DT_INT32) {
|
||||
int32_t *temp_dim = reinterpret_cast<int32_t *>(indices_tensor->GetData());
|
||||
temp_indices.emplace_back(static_cast<int64_t>(temp_dim[index * indices_two + 0]));
|
||||
temp_indices.emplace_back(static_cast<int64_t>(temp_dim[index * indices_two + 1]));
|
||||
} else {
|
||||
int64_t *temp_dim = reinterpret_cast<int64_t *>(indices_tensor->GetData());
|
||||
temp_indices.emplace_back(temp_dim[index * indices_two + 0]);
|
||||
temp_indices.emplace_back(temp_dim[index * indices_two + 1]);
|
||||
}
|
||||
}
|
||||
|
||||
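// Scatter the COO (index, value) pairs into the dense row_x1 x col_x1 matrix, in parallel once the
// number of non-zeros exceeds kParallelDataNums.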
if (indices_one <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < indices_one; i++) {
|
||||
int64_t row = temp_indices[i * indices_two + 0];
|
||||
int64_t col = temp_indices[i * indices_two + 1];
|
||||
sparse(row, col) = *(values + i);
|
||||
}
|
||||
} else {
|
||||
uint32_t minCoreNum = 1;
|
||||
int64_t maxCoreNum = std::max(minCoreNum, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
auto shardSparse = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t row = temp_indices[i * indices_two + 0];
|
||||
int64_t col = temp_indices[i * indices_two + 1];
|
||||
sparse(row, col) = *(values + i);
|
||||
}
|
||||
};
|
||||
CpuKernelUtils::ParallelFor(ctx, indices_one, indices_one / maxCoreNum, shardSparse);
|
||||
}
|
||||
|
||||
std::vector<int64_t> shape_x2 = dense_tensor->GetTensorShape()->GetDimSizes();
|
||||
const int64_t row_x2 = shape_x2[0];
|
||||
const int64_t col_x2 = shape_x2[1];
|
||||
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> dense(row_x2, col_x2);
|
||||
|
||||
std::vector<int64_t> shape_x3 = x3_dense_tensor->GetTensorShape()->GetDimSizes();
|
||||
const int64_t row_x3 = shape_x3[0];
|
||||
const int64_t col_x3 = shape_x3[1];
|
||||
|
||||
if (row_x3 != row_x1) {
|
||||
KERNEL_LOG_ERROR("x1's row is no equal x3's row, cannot do add!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (col_x3 != col_x2) {
|
||||
KERNEL_LOG_ERROR("x2's col is no equal x3's col, cannot do add!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (row_x2 <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < row_x2; i++) {
|
||||
for (int64_t j = 0; j < col_x2; j++) {
|
||||
dense(i, j) = *(dense_data + i * col_x2 + j);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t minCoreNum = 1;
|
||||
int64_t maxCoreNum = std::max(minCoreNum, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
auto shardDense = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
for (int64_t j = 0; j < col_x2; j++) {
|
||||
dense(i, j) = *(dense_data + i * col_x2 + j);
|
||||
}
|
||||
}
|
||||
};
|
||||
CpuKernelUtils::ParallelFor(ctx, row_x2, row_x2 / maxCoreNum, shardDense);
|
||||
}
|
||||
|
||||
if (col_x1 != row_x2) {
|
||||
KERNEL_LOG_ERROR("x1's col is no equal x2's row, cannot do mat mul!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> temp;
|
||||
temp = sparse * dense;
|
||||
|
||||
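// Write y = alpha * (x1 @ x2) + beta * x3 element by element, where temp = sparse * dense is x1 @ x2.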
if (row_x1 <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < row_x1; i++) {
|
||||
for (int64_t j = 0; j < col_x2; j++) {
|
||||
*(y + i * col_x2 + j) = *(alpha + 0) * temp(i, j);
|
||||
*(y + i * col_x2 + j) += *(beta + 0) * (*(x3_dense_data + i * col_x2 + j));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t minCoreNum = 1;
|
||||
int64_t maxCoreNum = std::max(minCoreNum, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
auto shardMatMul = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
for (int64_t j = 0; j < col_x2; j++) {
|
||||
*(y + i * col_x2 + j) = *(alpha + 0) * temp(i, j);
|
||||
*(y + i * col_x2 + j) += *(beta + 0) * (*(x3_dense_data + i * col_x2 + j));
|
||||
}
|
||||
}
|
||||
};
|
||||
CpuKernelUtils::ParallelFor(ctx, row_x1, row_x1 / maxCoreNum, shardMatMul);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSparseAddmm, SparseAddmmCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEADDMM_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSEADDMM_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class SparseAddmmCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~SparseAddmmCpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
uint32_t SparseAddmmCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T, typename T1>
|
||||
uint32_t SparseAddmmCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,169 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparsefillemptyrowsgrad.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
#include <numeric>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/allocator_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
|
||||
namespace {
|
||||
const char *kSparseFillEmptyRowsGrad = "SparseFillEmptyRowsGrad";
|
||||
const uint32_t kOutputNum = 2;
|
||||
const uint32_t kInputNum = 2;
|
||||
const int64_t kParallelNum{16384};
|
||||
|
||||
bool isVector(const std::shared_ptr<aicpu::TensorShape> shape) { return shape->GetDims() == 1; }
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
uint32_t SparseFillEmptyRowsGradCpuKernel::ComputeSparseFillEmptyRowsGrad(CpuKernelContext &ctx, DataBank &databank) {
|
||||
EigenTensor reverse_index_map_e(databank.reverse_index_map, databank.reverse_index_map->GetData());
|
||||
EigenTensor grad_values_e(databank.grad_values, databank.grad_values->GetData());
|
||||
EigenTensor y_value_e(databank.y_value, databank.y_value->GetData());
|
||||
|
||||
auto reverse_index_map = reverse_index_map_e.vec<int64_t>();
|
||||
auto grad_values = grad_values_e.vec<T>();
|
||||
auto y_value = y_value_e.vec<T>();
|
||||
|
||||
const int64_t N = databank.reverse_index_map->GetTensorShape()->GetDimSize(0);
|
||||
const int64_t N_full = databank.grad_values->GetTensorShape()->GetDimSize(0);
|
||||
|
||||
std::vector<bool> visited(N_full, false);
|
||||
T *y_default_value = reinterpret_cast<T *>(databank.y_default_value->GetData());
|
||||
*y_default_value = static_cast<T>(0);
|
||||
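// y_value(i) = grad_values(reverse_index_map(i)); rows of grad_values never referenced here belong to
// the rows that were filled with the default value, and their gradients are summed further below.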
if (N <= kParallelNum) {
|
||||
for (int64_t i = 0; i < N; ++i) {
|
||||
int64_t reverse_index = reverse_index_map(i);
|
||||
KERNEL_CHECK_FALSE(0 <= reverse_index && reverse_index < N_full, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Elements in reverse index must be in [0, [%d]) but got [%d]", N_full, reverse_index)
|
||||
y_value(i) = grad_values(reverse_index);
|
||||
visited[reverse_index] = true;
|
||||
}
|
||||
} else {
|
||||
int64_t total = N;
|
||||
uint32_t cores = CpuKernelUtils::GetCPUNum(ctx);
|
||||
int64_t per_unit_size = (total / std::min(std::max(1L, cores - 2L), total));
|
||||
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, total, per_unit_size, [&](int64_t begin, int64_t end) {
|
||||
for (int64_t i = begin; i < end; ++i) {
|
||||
int64_t reverse_index = reverse_index_map(i);
|
||||
KERNEL_CHECK_FALSE_VOID(0 <= reverse_index && reverse_index < N_full,
|
||||
"Elements in reverse index must be in [0, [%d]) but got [%d]", N_full, reverse_index);
|
||||
y_value(i) = grad_values(reverse_index);
|
||||
visited[reverse_index] = true;
|
||||
}
|
||||
});
|
||||
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), KERNEL_STATUS_INNER_ERROR, "SparseFillEmptyRowsGrad compute failed.");
|
||||
}
|
||||
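// Accumulate the gradients of the unvisited rows into the scalar default-value gradient.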
for (int64_t j = 0; j < N_full; ++j) {
|
||||
if (!visited[j]) {
|
||||
(*y_default_value) += grad_values(j);
|
||||
}
|
||||
}
|
||||
databank.y_default_value->GetTensorShape()->SetDimSizes({});
|
||||
databank.y_value->GetTensorShape()->SetDimSizes({N});
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SparseFillEmptyRowsGradCpuKernel::NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &databank) {
|
||||
databank.reverse_index_map = ctx.Input(0);
|
||||
databank.grad_values = ctx.Input(1);
|
||||
databank.y_value = ctx.Output(0);
|
||||
databank.y_default_value = ctx.Output(1);
|
||||
KERNEL_CHECK_FALSE(isVector(databank.reverse_index_map->GetTensorShape()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Inputs reverse_index_map should be vectors")
|
||||
KERNEL_CHECK_FALSE(isVector(databank.grad_values->GetTensorShape()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Inputs grad_values should be vectors")
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SparseFillEmptyRowsGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"SparseFillEmptyRowsGrad check input and output number failed.");
|
||||
DataBank databank;
|
||||
KERNEL_HANDLE_ERROR(NullptrAndMatVecCheck(ctx, databank), "SparseFillEmptyRowsGrad check params failed.");
|
||||
DataType dt = static_cast<DataType>(databank.y_value->GetDataType());
|
||||
|
||||
uint32_t KERNEL_STATUS;
|
||||
switch (dt) {
|
||||
case DT_INT8:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int8_t>(ctx, databank);
|
||||
break;
|
||||
case DT_UINT8:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint8_t>(ctx, databank);
|
||||
break;
|
||||
case DT_INT16:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int16_t>(ctx, databank);
|
||||
break;
|
||||
case DT_UINT16:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint16_t>(ctx, databank);
|
||||
break;
|
||||
case DT_INT32:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int32_t>(ctx, databank);
|
||||
break;
|
||||
case DT_UINT32:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint32_t>(ctx, databank);
|
||||
break;
|
||||
case DT_INT64:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int64_t>(ctx, databank);
|
||||
break;
|
||||
case DT_UINT64:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint64_t>(ctx, databank);
|
||||
break;
|
||||
case DT_BOOL:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<bool>(ctx, databank);
|
||||
break;
|
||||
case DT_STRING:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<std::string>(ctx, databank);
|
||||
break;
|
||||
case DT_FLOAT16:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<Eigen::half>(ctx, databank);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<float>(ctx, databank);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<double>(ctx, databank);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<std::complex<float>>(ctx, databank);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<std::complex<double>>(ctx, databank);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SparseFillEmptyRowsGrad can't support this data type [%s].", DTypeStr(dt).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (KERNEL_STATUS != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("SparseFillEmptyRowsGrad failed.");
|
||||
return KERNEL_STATUS;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSparseFillEmptyRowsGrad, SparseFillEmptyRowsGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,45 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEFILLEMPTYROWSGRAD_H_
#define AICPU_KERNELS_NORMALIZED_SPARSEFILLEMPTYROWSGRAD_H_
#include <set>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/sparse_group.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
// Define the aicpu namespace
|
||||
|
||||
namespace aicpu {
|
||||
struct DataBank {
|
||||
DataBank() : reverse_index_map(nullptr), grad_values(nullptr), y_value(nullptr), y_default_value(nullptr) {}
|
||||
Tensor *reverse_index_map;
|
||||
Tensor *grad_values;
|
||||
Tensor *y_value;
|
||||
Tensor *y_default_value;
|
||||
};
|
||||
|
||||
// The operator class inherits from the CpuKernel base class
|
||||
class SparseFillEmptyRowsGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~SparseFillEmptyRowsGradCpuKernel() = default;
|
||||
SparseFillEmptyRowsGradCpuKernel() = default;
|
||||
// Declare the Compute function; it must override the base class implementation
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &calc_info);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComputeSparseFillEmptyRowsGrad(CpuKernelContext &ctx, DataBank &databank);
|
||||
};
|
||||
} // namespace aicpu
#endif
|
|
@ -0,0 +1,190 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "split.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kSplit = "Split";
|
||||
constexpr uint32_t kSplitInputNum = 2;
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SplitCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
AttrValue *num_split_ptr = ctx.GetAttr("num_split");
KERNEL_CHECK_NULLPTR(num_split_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr num_split failed.");
|
||||
num_split_ = num_split_ptr->GetInt();
|
||||
uint32_t kSplitOutputNum = num_split_ptr->GetInt();
|
||||
attr_names.emplace_back("num_split");
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSplitInputNum, kSplitOutputNum, attr_names), "[%s] check params failed.",
|
||||
kSplit);
|
||||
KERNEL_CHECK_FALSE((num_split_ >= 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Attr num_split must >= 1, but got attr num_split[%lld]", num_split_);
|
||||
Tensor *split_dim_ptr = ctx.Input(0);
|
||||
auto split_dim_shape_ptr = split_dim_ptr->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((split_dim_shape_ptr->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input split_dim should be a scalar integer, but got rank[%lld]", split_dim_shape_ptr->GetDims());
|
||||
KERNEL_CHECK_FALSE((split_dim_ptr->GetDataType() == DT_INT32), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input split_dim data type must be DT_INT32, but got data type[%s]",
|
||||
DTypeStr(split_dim_ptr->GetDataType()).c_str());
|
||||
auto split_dim_data_ptr = split_dim_ptr->GetData();
|
||||
KERNEL_CHECK_NULLPTR(split_dim_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input split_dim data failed.");
|
||||
split_dim_ = *(reinterpret_cast<int32_t *>(split_dim_data_ptr));
|
||||
Tensor *value_ptr = ctx.Input(1);
|
||||
value_data_ptr_ = value_ptr->GetData();
|
||||
auto value_shape_ptr = value_ptr->GetTensorShape();
|
||||
int64_t value_dim = value_shape_ptr->GetDims();
|
||||
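// A negative split_dim counts from the last dimension, Python-style.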
if (split_dim_ < 0) {
|
||||
split_dim_ += value_dim;
|
||||
}
|
||||
KERNEL_CHECK_FALSE(value_dim > split_dim_, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dim of Input value must greater than split_dim, value dim is [%d], split_dim is [%d].", value_dim,
|
||||
split_dim_);
|
||||
value_shape_vec_ = value_shape_ptr->GetDimSizes();
|
||||
data_type_ = value_ptr->GetDataType();
|
||||
value_num_ = value_ptr->NumElements();
|
||||
KERNEL_CHECK_FALSE((value_shape_ptr->GetDimSize(split_dim_) % num_split_ == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Number of ways to split should evenly divide the split "
|
||||
"dimension, but got split_dim [%d] (size = [%lld]) and num_split is [%lld]",
|
||||
split_dim_, value_shape_ptr->GetDimSize(split_dim_), num_split_);
|
||||
output_ptr_vec_.resize(num_split_);
|
||||
for (int64_t i = 0; i < num_split_; i++) {
|
||||
Tensor *output_ptr = ctx.Output(i);
|
||||
auto output_data_ptr = output_ptr->GetData();
|
||||
output_ptr_vec_[i] = output_data_ptr;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SplitCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
T *input_data_ptr = static_cast<T *>(value_data_ptr_);
|
||||
std::vector<T *> output_data_vec;
|
||||
output_data_vec.resize(num_split_);
|
||||
for (int64_t i = 0; i < num_split_; i++) {
|
||||
output_data_vec[i] = reinterpret_cast<T *>(output_ptr_vec_[i]);
|
||||
}
|
||||
|
||||
if (num_split_ == 1) {
|
||||
KERNEL_CHECK_FALSE((SplitWithOneOutput<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
|
||||
KERNEL_STATUS_PARAM_INVALID, "SplitWithOneOutput failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
if (split_dim_ == 0) {
|
||||
KERNEL_CHECK_FALSE((SplitWithDimZero<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
|
||||
KERNEL_STATUS_PARAM_INVALID, "SplitWithDimZero failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((SplitCompute<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
|
||||
KERNEL_STATUS_PARAM_INVALID, "Split Compute failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SplitCpuKernel::SplitWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec) {
|
||||
int64_t copy_size = value_num_ * sizeof(T);
|
||||
auto mem_ret = memcpy_s(output_data_vec[0], copy_size, input_data_ptr, copy_size);
|
||||
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Memcpy size[%zu] from input value to output[0] failed.", copy_size);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SplitCpuKernel::SplitWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec) {
|
||||
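// Splitting along dim 0 reduces to contiguous copies: each output receives value_shape[0] / num_split
// leading slices of the input, in order.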
int64_t copy_num = value_num_ / value_shape_vec_[0];
|
||||
T *input_copy_ptr = input_data_ptr;
|
||||
const int64_t split_dim_output_size = value_shape_vec_[0] / num_split_;
|
||||
for (int32_t i = 0; i < num_split_; i++) {
|
||||
int64_t copy_size_per = copy_num * split_dim_output_size;
|
||||
int64_t copy_size = copy_size_per * sizeof(T);
|
||||
auto mem_ret = memcpy_s(output_data_vec[i], copy_size, input_copy_ptr, copy_size);
|
||||
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
|
||||
input_copy_ptr += copy_size_per;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SplitCpuKernel::SplitCompute(T *input_data_ptr, std::vector<T *> output_data_vec) {
|
||||
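// View the input as [prefix, midfix, subfix] with midfix the size of split_dim; output i takes a
// contiguous chunk of subfix * (midfix / num_split) elements out of each of the prefix outer blocks.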
int64_t prefix = 1;
|
||||
for (int32_t i = 0; i < split_dim_; ++i) {
|
||||
prefix *= value_shape_vec_[i];
|
||||
}
|
||||
int64_t midfix = value_shape_vec_[split_dim_];
|
||||
int64_t subfix = 1;
|
||||
for (size_t i = split_dim_ + 1; i < value_shape_vec_.size(); i++) {
|
||||
subfix *= value_shape_vec_[i];
|
||||
}
|
||||
const int64_t split_dim_output_size = midfix / num_split_;
|
||||
int64_t offset = 0;
|
||||
for (int64_t i = 0; i < num_split_; ++i) {
|
||||
T *output_data_ptr = output_data_vec[i];
|
||||
T *input_copy_ptr = input_data_ptr + offset;
|
||||
int64_t copy_num = subfix * split_dim_output_size;
|
||||
int64_t copy_size = copy_num * sizeof(T);
|
||||
for (int64_t j = 0; j < prefix; j++) {
|
||||
auto mem_ret = memcpy_s(output_data_ptr, copy_size, input_copy_ptr, copy_size);
|
||||
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
|
||||
input_copy_ptr += (subfix * midfix);
|
||||
output_data_ptr += copy_num;
|
||||
}
|
||||
offset += copy_num;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SplitCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_CHECK_FALSE((CheckAndInitParams(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"CheckAndInitParams failed.");
|
||||
switch (data_type_) {
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<bool>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<uint16_t>(ctx);
|
||||
case DT_UINT32:
|
||||
return DoCompute<uint32_t>(ctx);
|
||||
case DT_UINT64:
|
||||
return DoCompute<uint64_t>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return DoCompute<std::complex<float>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return DoCompute<std::complex<double>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported datatype[%s]", DTypeStr(data_type_).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSplit, SplitCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,84 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPLIT_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPLIT_H_
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "securec.h"
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SplitCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SplitCpuKernel() = default;
|
||||
~SplitCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief Init params
|
||||
* @param ctx cpu kernel context
|
||||
* @return status if success
|
||||
*/
|
||||
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
|
||||
/**
|
||||
* @brief split data when split num is 1
|
||||
* @param input_data_ptr ptr which store input data
|
||||
* @param output_data_vec vector which store all output data ptr
|
||||
* @return status if success
|
||||
*/
|
||||
template <typename T>
|
||||
uint32_t SplitWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec);
|
||||
/**
|
||||
* @brief split data when split dim is 0
|
||||
* @param input_data_ptr ptr which store input data
|
||||
* @param output_data_vec vector which store all output data ptr
|
||||
* @return status if success
|
||||
*/
|
||||
template <typename T>
|
||||
uint32_t SplitWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec);
|
||||
/**
|
||||
* @brief split data
|
||||
* @param input_data_ptr ptr which store input data
|
||||
* @param output_data_vec vector which store all output data ptr
|
||||
* @return status if success
|
||||
*/
|
||||
template <typename T>
|
||||
uint32_t SplitCompute(T *input_data_ptr, std::vector<T *> output_data_vec);
|
||||
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
|
||||
private:
|
||||
DataType data_type_;
|
||||
int32_t split_dim_;
|
||||
int64_t num_split_;
|
||||
int64_t value_num_;
|
||||
void *value_data_ptr_;
|
||||
std::vector<void *> output_ptr_vec_;
|
||||
std::vector<int64_t> value_shape_vec_;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,142 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sqrt.h"
|
||||
|
||||
#include <complex>
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const std::uint32_t kSqrtInputNum{1};
|
||||
const std::uint32_t kSqrtOutputNum{1};
|
||||
const std::uint32_t Parallel4ThreadNum{4096};
|
||||
const std::uint32_t Parallel6ThreadNum{8192};
|
||||
const std::uint32_t ParallelNum{16384};
|
||||
const char *kSqrt{"Sqrt"};
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
namespace detail {
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSqrtKernel(const CpuKernelContext &ctx) {
|
||||
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
|
||||
auto input = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
std::int64_t total = ctx.Input(0)->NumElements();
|
||||
std::uint64_t total_size = ctx.Input(0)->GetDataSize();
|
||||
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
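// Thread-count heuristic: stay single-threaded for small tensors, use 6 or 8 cores for mid-sized ones,
// and all available cores once the data exceeds ParallelNum elements.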
bool parallel_flag = false;
|
||||
if (total_size > ParallelNum * sizeof(T)) {
|
||||
parallel_flag = true;
|
||||
} else if (total_size > Parallel6ThreadNum * sizeof(T)) {
|
||||
parallel_flag = true;
|
||||
cores = 8;
|
||||
} else if (total_size > Parallel4ThreadNum * sizeof(T)) {
|
||||
parallel_flag = true;
|
||||
cores = 6;
|
||||
}
|
||||
if (parallel_flag) {
|
||||
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
|
||||
return ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
|
||||
std::transform(input + begin, input + end, output + begin, Eigen::numext::sqrt<T>);
|
||||
});
|
||||
} else if (cores != 0) {
|
||||
std::transform(input, input + total, output, Eigen::numext::sqrt<T>);
|
||||
} else {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSqrt(const CpuKernelContext &ctx) {
|
||||
uint32_t result = ComputeSqrtKernel<T>(ctx);
|
||||
if (result != 0) {
|
||||
KERNEL_LOG_ERROR("Sqrt compute failed.");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::uint32_t SqrtExtraCheck(const CpuKernelContext &ctx) {
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get input data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Output(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get output data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
if (input_dims.size() != output_dims.size()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data dim of the input size [%llu] need be the same as the output "
|
||||
"size [%llu].",
|
||||
input_dims.size(), output_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t index = 0; index < input_dims.size(); index++) {
|
||||
if (input_dims[index] != output_dims[index]) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data dim[%llu]=%lld of the input need be the same as the output "
|
||||
"dim[%llu]=%lld.",
|
||||
index, input_dims[index], index, output_dims[index]);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
std::uint32_t SqrtCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
|
||||
return NormalCheck(ctx, kSqrtInputNum, kSqrtOutputNum) ? KERNEL_STATUS_PARAM_INVALID : SqrtExtraCheck(ctx);
|
||||
}
|
||||
|
||||
std::uint32_t SqrtCompute(const CpuKernelContext &ctx) {
|
||||
DataType input_type{ctx.Input(0)->GetDataType()};
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeSqrt<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeSqrt<std::float_t>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeSqrt<std::double_t>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return ComputeSqrt<std::complex<std::float_t> >(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return ComputeSqrt<std::complex<std::double_t> >(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
std::uint32_t SqrtCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
return detail::SqrtCheck(ctx, kSqrtInputNum, kSqrtOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::SqrtCompute(ctx);
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSqrt, SqrtCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,25 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SQRT_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SQRT_H_
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SqrtCpuKernel final : public CpuKernel {
|
||||
std::uint32_t Compute(CpuKernelContext &ctx) override final;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,248 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sqrtgrad.h"
|
||||
|
||||
#include <complex>
|
||||
#include <cstdint>
|
||||
#include <typeinfo>
|
||||
#include "Eigen/Dense"
|
||||
|
||||
#include <iostream>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "kernel_log.h"
|
||||
#include "securec.h"
|
||||
#include "status.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kSqrtGrad = "SqrtGrad";
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
|
||||
#define SQRTGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SqrtGradCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SqrtGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define SQRTGRAD_COMPUTE_COMPLEX_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SqrtGradComputeComplex<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SqrtGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SqrtGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSqrtGrad);
|
||||
KERNEL_HANDLE_ERROR(SqrtGradParamCheck(ctx), "[%s] check params failed.", kSqrtGrad);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
SQRTGRAD_COMPUTE_COMPLEX_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
SQRTGRAD_COMPUTE_COMPLEX_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
SQRTGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
SQRTGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
SQRTGRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SqrtGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SqrtGradCpuKernel::SqrtGradParamCheck(CpuKernelContext &ctx) {
|
||||
// the non null of input_0, input_1, output has been verified in NormalCheck
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
|
||||
KERNEL_LOG_DEBUG(
|
||||
"SqrtGradCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
SpecialCompute is used in the following situations:
|
||||
1. the shapes of input1 and input2 are the same
|
||||
2. input1 is a 1D tensor with only one element or input1 is scalar
|
||||
3. input2 is a 1D tensor with only one element or input2 is scalar
|
||||
4. the shapes of input1 and input2 are different
|
||||
*/
|
||||
template <typename T>
|
||||
void SqrtGradCpuKernel::SpecialCompute(int64_t start, int64_t end, T *input1, T *input2, T *output) {
|
||||
int flag = 0;
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input1 + i) == static_cast<T>(0)) {
|
||||
flag = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
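// dx = dy * 0.5 / y, where input1 holds y (the forward sqrt output) and input2 holds dy.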
for (int64_t i = start; i < end; ++i) {
|
||||
*(output + i) = *(input2 + i) * static_cast<T>(0.5) / *(input1 + i);
|
||||
}
|
||||
|
||||
if (flag == 1) KERNEL_LOG_WARN("divide by zero encountered");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void SqrtGradCpuKernel::SpecialComputeComplex(int64_t start, int64_t end, T *input1, T *input2, T *output) {
|
||||
int flag = 0;
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input1 + i) == static_cast<T>(0)) {
|
||||
flag = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
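// Complex gradient uses the conjugate of the forward output: dx = dy * 0.5 / conj(y), with an explicit
// guard for conj(y) == 0.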
for (int64_t i = start; i < end; ++i) {
|
||||
T in1 = *(input1 + i);
|
||||
T in1_conj = std::conj(in1);
|
||||
if (in1_conj == static_cast<T>(0)) {
|
||||
*(output + i) = INFINITY;
|
||||
} else {
|
||||
*(output + i) = *(input2 + i) * static_cast<T>(0.5) / in1_conj;
|
||||
}
|
||||
}
|
||||
if (flag == 1) KERNEL_LOG_WARN("divide by zero encountered");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SqrtGradCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t data_num = in0_elements_nums;
|
||||
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_sqrtgrad = [&](size_t start, size_t end) { SpecialCompute<T>(start, end, in0, in1, out); };
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_sqrtgrad),
|
||||
"SqrtGrad Compute failed.");
|
||||
} else {
|
||||
SpecialCompute<T>(0, data_num, in0, in1, out);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SqrtGradCpuKernel::NoBcastComputeComplex(CpuKernelContext &ctx) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t data_num = in0_elements_nums;
|
||||
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_sqrtgrad = [&](size_t start, size_t end) { SpecialComputeComplex<T>(start, end, in0, in1, out); };
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_sqrtgrad),
|
||||
"SqrtGrad Compute failed.");
|
||||
} else {
|
||||
SpecialComputeComplex<T>(0, data_num, in0, in1, out);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SqrtGradCpuKernel::SqrtGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
|
||||
|
||||
if (input0_elements_nums != input1_elements_nums) {
|
||||
KERNEL_LOG_WARN("Invalid element numbers, got[%d] and [%d]", static_cast<int32_t>(input0_elements_nums),
|
||||
static_cast<int32_t>(input1_elements_nums));
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else {
|
||||
return NoBcastCompute<T>(ctx);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SqrtGradCpuKernel::SqrtGradComputeComplex(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
|
||||
|
||||
if (input0_elements_nums != input1_elements_nums) {
|
||||
KERNEL_LOG_WARN("Invalid element numbers, got[%d] and [%d]", static_cast<int32_t>(input0_elements_nums),
|
||||
static_cast<int32_t>(input1_elements_nums));
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else {
|
||||
return NoBcastComputeComplex<T>(ctx);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSqrtGrad, SqrtGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SQRTGRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SQRTGRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SqrtGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SqrtGradCpuKernel() = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t SqrtGradParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void SpecialCompute(int64_t start, int64_t end, T *input1, T *input2, T *output);
|
||||
template <typename T>
|
||||
void SpecialComputeComplex(int64_t start, int64_t end, T *input1, T *input2, T *output);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastComputeComplex(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SqrtGradCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SqrtGradComputeComplex(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,85 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "tanh.h"
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "cmath"
|
||||
#include <complex>
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *kTanh = "Tanh";
|
||||
constexpr int64_t kParallelDataNums = 128 * 1024;
|
||||
|
||||
#define Tanh_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = TanhCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Tanh kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t TanhCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kTanh);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
    Tanh_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
    Tanh_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
    Tanh_COMPUTE_CASE(DT_FLOAT, float, ctx)
    Tanh_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    Tanh_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("Tanh kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t TanhCpuKernel::TanhCompute(CpuKernelContext &ctx) {
|
||||
Eigen::internal::scalar_tanh_op<T> tanh_op;
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
size_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (size_t i = 0; i < data_num; i++) {
|
||||
auto x_idx = input_x + i; // i-th value of input0
|
||||
*(output_y + i) = tanh_op((*x_idx));
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
auto shard_Tanh = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto x_idx = input_x + i; // i-th value of input0
|
||||
*(output_y + i) = tanh_op((*x_idx));
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_Tanh),
|
||||
"Tanh Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kTanh, TanhCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,34 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_TANH_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_TANH_H_
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class TanhCpuKernel : public CpuKernel {
|
||||
public:
|
||||
TanhCpuKernel() = default;
|
||||
~TanhCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t TanhCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,156 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
#include "tile.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "Eigen/Core"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kTile = "Tile";
|
||||
|
||||
#define TILE_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = TileCompute<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Tile kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define TILE_COMPUTE_CASE_ALL(TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
|
||||
TILE_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t TileCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Tile check input and output number failed.");
|
||||
Tensor *input_x0 = ctx.Input(0);
|
||||
Tensor *input_x1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
auto size_0 = ctx.Input(0)->GetTensorShape()->GetDims();
|
||||
auto size_1 = ctx.Input(1)->GetTensorShape()->GetDims();
|
||||
  KERNEL_CHECK_FALSE((size_0 >= 1), KERNEL_STATUS_PARAM_INVALID, "Dimension of x must be 1 or higher, but got [%d].",
                     static_cast<int32_t>(size_0));
  KERNEL_CHECK_FALSE((size_1 == 1), KERNEL_STATUS_PARAM_INVALID, "Dimension of multiples must be 1, but got [%d].",
                     static_cast<int32_t>(size_1));
|
||||
KERNEL_CHECK_FALSE((size_0 == input_x1->NumElements()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Multiples length must be the same as the number of dimensions in x.");
|
||||
KERNEL_LOG_DEBUG(
|
||||
"TileCpuKernel[%s], inputx0: size[%llu];"
|
||||
"inputx1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_x0->GetDataSize(), input_x1->GetDataSize(), output->GetDataSize());
|
||||
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
DataType multiples_type = ctx.Input(1)->GetDataType();
|
||||
switch (multiples_type) {
|
||||
case DT_INT32:
|
||||
switch (data_type) {
|
||||
TILE_COMPUTE_CASE_ALL(int32_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (data_type) {
|
||||
TILE_COMPUTE_CASE_ALL(int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(multiples_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename M>
|
||||
void TileCpuKernel::CopyMultipleTimes(const T *in_data, int64_t in_size, M multiplier, T *out_data) {
|
||||
for (M i = 0; i < multiplier; ++i) {
|
||||
const T *in_end = in_data + in_size;
|
||||
T *new_out_data = std::copy(in_data, in_end, out_data);
|
||||
in_data = out_data;
|
||||
out_data = new_out_data;
|
||||
}
|
||||
}
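// Recursively tiles dimension `dimension` and every deeper dimension, returning the pair
// {input elements consumed, output elements produced} for that sub-block.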
|
||||
|
||||
template <typename T, typename M>
|
||||
std::pair<int64_t, int64_t> TileCpuKernel::TileOneDimension(const std::vector<int64_t> &in_dimensions, const T *in_data,
|
||||
const M *multipliers, T *out_data, int64_t dimension) {
|
||||
if (in_dimensions.size() == 0) {
|
||||
// If input tensor is a scalar, then just copy it to output (no need to
|
||||
// multiply).
|
||||
*out_data = *in_data;
|
||||
return std::make_pair(0, 0);
|
||||
}
|
||||
|
||||
const int64_t dimension_size = in_dimensions[dimension];
|
||||
if (dimension == static_cast<int64_t>(in_dimensions.size() - 1)) {
|
||||
CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data);
|
||||
return std::make_pair(dimension_size, dimension_size * static_cast<int64_t>(multipliers[dimension]));
|
||||
}
|
||||
int64_t total_stride_size = 0, total_tiled_stride_size = 0;
|
||||
const T *copy_from_data = in_data;
|
||||
T *copy_to_data = out_data;
|
||||
for (int64_t i = 0; i < dimension_size; ++i) {
|
||||
int64_t stride_size = 0, tiled_stride_size = 0;
|
||||
std::tie(stride_size, tiled_stride_size) =
|
||||
TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1);
|
||||
copy_from_data += stride_size;
|
||||
copy_to_data += tiled_stride_size;
|
||||
total_stride_size += stride_size;
|
||||
total_tiled_stride_size += tiled_stride_size;
|
||||
}
|
||||
CopyMultipleTimes(out_data, total_tiled_stride_size, multipliers[dimension] - 1, out_data + total_tiled_stride_size);
|
||||
return std::make_pair(total_stride_size, static_cast<int64_t>(total_tiled_stride_size * multipliers[dimension]));
|
||||
}
|
||||
|
||||
template <typename T, typename M>
|
||||
uint32_t TileCpuKernel::TileCompute(CpuKernelContext &ctx) {
|
||||
auto x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto multiples = reinterpret_cast<M *>(ctx.Input(1)->GetData());
|
||||
auto y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
std::vector<int64_t> in_dimensions = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
TileOneDimension(in_dimensions, x, multiples, y, 0);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kTile, TileCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,43 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_TILE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_TILE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class TileCpuKernel : public CpuKernel {
|
||||
public:
|
||||
TileCpuKernel() = default;
|
||||
~TileCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T, typename M>
|
||||
void CopyMultipleTimes(const T *in_data, int64_t in_size, M multiplier, T *out_data);
|
||||
|
||||
template <typename T, typename M>
|
||||
std::pair<int64_t, int64_t> TileOneDimension(const std::vector<int64_t> &in_dimensions, const T *in_data,
|
||||
const M *multipliers, T *out_data, int64_t dimension);
|
||||
|
||||
template <typename T, typename M>
|
||||
uint32_t TileCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,220 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "transpose.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "securec.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kTranspose = "Transpose";
|
||||
|
||||
#define TRANSPOSE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = TransposeCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Transpose kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t TransposeCpuKernel::GetTransposeValue(Tensor *tensor, std::vector<int64_t> &value) {
|
||||
auto type = tensor->GetDataType();
|
||||
if (type == DT_INT32) {
|
||||
auto data = reinterpret_cast<int32_t *>(tensor->GetData());
|
||||
    for (int64_t i = 0; i < tensor->NumElements(); i++) {
|
||||
value.push_back(static_cast<int64_t>(*(data + i)));
|
||||
}
|
||||
} else if (type == DT_INT64) {
|
||||
auto data = reinterpret_cast<int64_t *>(tensor->GetData());
|
||||
    for (int64_t i = 0; i < tensor->NumElements(); i++) {
|
||||
value.push_back(*(data + i));
|
||||
}
|
||||
} else {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t TransposeCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kTranspose);
|
||||
KERNEL_HANDLE_ERROR(TransposeParamCheck(ctx), "[%s] check params failed.", kTranspose);
|
||||
auto x_type = ctx.Input(0)->GetDataType();
|
||||
switch (x_type) {
|
||||
TRANSPOSE_COMPUTE_CASE(DT_BOOL, bool, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
TRANSPOSE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Transpose kernel data type [%s] not support.", DTypeStr(x_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t TransposeCpuKernel::TransposeParamCheck(CpuKernelContext &ctx) {
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_perm = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
|
||||
auto perm_tensor = ctx.Input(1);
|
||||
auto y_tensor = ctx.Output(0);
|
||||
|
||||
  KERNEL_CHECK_FALSE((shape_perm.size() == 1), KERNEL_STATUS_PARAM_INVALID,
                     "Expected perm to be a 1-D tensor, but got a [%zu]-D tensor.", shape_perm.size())
|
||||
KERNEL_CHECK_FALSE((perm_tensor->NumElements() == (unsigned int)shape_x.size()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Expected the size of perm to be [%zu], but got [%zu].", shape_x.size(),
|
||||
perm_tensor->NumElements())
|
||||
KERNEL_CHECK_FALSE((GetTransposeValue(perm_tensor, perm) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"perm must be either int32 or int64, but got [%s].", DTypeStr(perm_tensor->GetDataType()).c_str())
|
||||
KERNEL_CHECK_FALSE((shape_x.size() > 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Expected the dimension of x to be greater than 1-D, but got [%zu].", shape_x.size())
|
||||
|
||||
std::vector<int64_t> shape_y;
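  // Output dimension i takes the size of input dimension perm[i];
  // e.g. perm = {1, 0} maps an input of shape [2, 3] to an output of shape [3, 2].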
|
||||
for (size_t i = 0; i < shape_x.size(); ++i) {
|
||||
int64_t perm_value = perm.at(i);
|
||||
if (shape_x.at(i) == 0) {
|
||||
KERNEL_CHECK_FALSE((perm_value == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Expected perm[%zu] == 0 (got %zu), when x shape[%zu] == 0.", i, perm_value, i)
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE((0 <= perm_value && perm_value <= (unsigned int)shape_x.size() - 1),
|
||||
KERNEL_STATUS_PARAM_INVALID, "Expected perm[%zu] in [0, %zu], but got %zu.", i, shape_x.size(),
|
||||
perm_value)
|
||||
}
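    // Check that dimension index i occurs in perm; combined with the length check above,
    // this guarantees perm is a permutation of 0..rank-1.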
|
||||
int64_t temp_value = 0;
|
||||
for (size_t j = 0; j < shape_x.size(); ++j) {
|
||||
if ((unsigned int)perm.at(j) == i) {
|
||||
break;
|
||||
} else {
|
||||
temp_value = j + 1;
|
||||
KERNEL_CHECK_FALSE((temp_value < (unsigned int)shape_x.size()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Expected perm value is unique.")
|
||||
}
|
||||
}
|
||||
shape_y.push_back(shape_x.at(perm_value));
|
||||
}
|
||||
y_tensor->GetTensorShape()->SetDimSizes(shape_y);
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t TransposeCpuKernel::TransposeCompute(CpuKernelContext &ctx) {
|
||||
auto x_data = ctx.Input(0)->GetData();
|
||||
auto y_data = ctx.Output(0)->GetData();
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_y = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
auto input_data = reinterpret_cast<T *>(x_data);
|
||||
auto output_data = reinterpret_cast<T *>(y_data);
|
||||
int64_t input_dims = shape_x.size();
|
||||
switch (input_dims) {
|
||||
case 2: {
|
||||
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_2D;
|
||||
Eigen_Tensor_2D input_2D(input_data, shape_x.at(0), shape_x.at(1));
|
||||
Eigen_Tensor_2D output_2D(output_data, shape_y.at(0), shape_y.at(1));
|
||||
Eigen::array<Eigen::DenseIndex, 2> perm_2D;
|
||||
for (size_t i = 0; i < 2; ++i) {
|
||||
perm_2D[i] = perm.at(i);
|
||||
}
|
||||
output_2D = input_2D.shuffle(perm_2D);
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
typedef Eigen::TensorMap<Eigen::Tensor<T, 3, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_3D;
|
||||
Eigen_Tensor_3D input_3D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2));
|
||||
Eigen_Tensor_3D output_3D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2));
|
||||
Eigen::array<Eigen::DenseIndex, 3> perm_3D;
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
perm_3D[i] = perm.at(i);
|
||||
}
|
||||
output_3D = input_3D.shuffle(perm_3D);
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
typedef Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_4D;
|
||||
Eigen_Tensor_4D input_4D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3));
|
||||
Eigen_Tensor_4D output_4D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3));
|
||||
Eigen::array<Eigen::DenseIndex, 4> perm_4D;
|
||||
for (size_t i = 0; i < 4; ++i) {
|
||||
perm_4D[i] = perm.at(i);
|
||||
}
|
||||
output_4D = input_4D.shuffle(perm_4D);
|
||||
break;
|
||||
}
|
||||
case 5: {
|
||||
typedef Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_5D;
|
||||
Eigen_Tensor_5D input_5D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3), shape_x.at(4));
|
||||
Eigen_Tensor_5D output_5D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3), shape_y.at(4));
|
||||
Eigen::array<Eigen::DenseIndex, 5> perm_5D;
|
||||
for (size_t i = 0; i < 5; ++i) {
|
||||
perm_5D[i] = perm.at(i);
|
||||
}
|
||||
output_5D = input_5D.shuffle(perm_5D);
|
||||
break;
|
||||
}
|
||||
case 6: {
|
||||
typedef Eigen::TensorMap<Eigen::Tensor<T, 6, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_6D;
|
||||
Eigen_Tensor_6D input_6D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3), shape_x.at(4),
|
||||
shape_x.at(5));
|
||||
Eigen_Tensor_6D output_6D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3), shape_y.at(4),
|
||||
shape_y.at(5));
|
||||
Eigen::array<Eigen::DenseIndex, 6> perm_6D;
|
||||
for (size_t i = 0; i < 6; ++i) {
|
||||
perm_6D[i] = perm.at(i);
|
||||
}
|
||||
output_6D = input_6D.shuffle(perm_6D);
|
||||
break;
|
||||
}
|
||||
case 7: {
|
||||
typedef Eigen::TensorMap<Eigen::Tensor<T, 7, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_7D;
|
||||
Eigen_Tensor_7D input_7D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3), shape_x.at(4),
|
||||
shape_x.at(5), shape_x.at(6));
|
||||
Eigen_Tensor_7D output_7D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3), shape_y.at(4),
|
||||
shape_y.at(5), shape_y.at(6));
|
||||
Eigen::array<Eigen::DenseIndex, 7> perm_7D;
|
||||
for (size_t i = 0; i < 7; ++i) {
|
||||
perm_7D[i] = perm.at(i);
|
||||
}
|
||||
output_7D = input_7D.shuffle(perm_7D);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] : Unhandled input dimensions [%zu].", kTranspose, input_dims);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kTranspose, TransposeCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_TRANSPOSE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_TRANSPOSE_H_
|
||||
|
||||
#include <vector>
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class TransposeCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~TransposeCpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
std::vector<int64_t> perm;
|
||||
uint32_t TransposeParamCheck(CpuKernelContext &ctx);
|
||||
uint32_t GetTransposeValue(Tensor *tensor, std::vector<int64_t> &value);
|
||||
|
||||
template <typename T>
|
||||
uint32_t TransposeCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif  // AICPU_KERNELS_NORMALIZED_TRANSPOSE_H_
|
|
@ -0,0 +1,127 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tridiagonal_matmul.h"
|
||||
|
||||
#include <complex>
|
||||
#include "Eigen/Core"
|
||||
#include "Eigen/Dense"
|
||||
#include "Eigen/LU"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kInputNum = 4;
|
||||
constexpr uint32_t kOutputNum = 1;
|
||||
const char *kTridiagonalMatMul = "TridiagonalMatMul";
|
||||
} // namespace
|
||||
namespace aicpu {
|
||||
|
||||
uint32_t TridiagonalMatMulCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "TridiagonalMatMul check input and output num failed.");
|
||||
KERNEL_HANDLE_ERROR(TridiagonalMatMulDataAndTypeCheck(ctx),
|
||||
"TridiagonalMatMul check input and output params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
return TridiagonalMatMulCompute<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return TridiagonalMatMulCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return TridiagonalMatMulCompute<double>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return TridiagonalMatMulCompute<std::complex<float>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return TridiagonalMatMulCompute<std::complex<double>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type[%s]", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t TridiagonalMatMulCpuKernel::TridiagonalMatMulDataAndTypeCheck(CpuKernelContext &ctx) {
|
||||
DataType superdiag_type = ctx.Input(0)->GetDataType();
|
||||
DataType maindiag_type = ctx.Input(1)->GetDataType();
|
||||
DataType subdiag_type = ctx.Input(2)->GetDataType();
|
||||
DataType rhs_type = ctx.Input(3)->GetDataType();
|
||||
KERNEL_CHECK_FALSE((superdiag_type == maindiag_type && maindiag_type == subdiag_type && subdiag_type == rhs_type),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s], input1 [%s],input2 [%s] and input3 [%s] "
|
||||
"need be same.",
|
||||
DTypeStr(superdiag_type).c_str(), DTypeStr(maindiag_type).c_str(), DTypeStr(subdiag_type).c_str(),
|
||||
DTypeStr(rhs_type).c_str())
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t TridiagonalMatMulCpuKernel::TridiagonalMatMulCompute(CpuKernelContext &ctx) {
|
||||
auto superdiag_tensor = ctx.Input(0);
|
||||
auto superdiag_tensor_shape = superdiag_tensor->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((IsVector(superdiag_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
|
||||
"invalid Input[superdiag]")
|
||||
auto maindiag_tensor = ctx.Input(1);
|
||||
auto maindiag_tensor_shape = maindiag_tensor->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((IsVector(maindiag_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
|
||||
"invalid Input[maindiag]")
|
||||
auto subdiag_tensor = ctx.Input(2);
|
||||
auto subdiag_tensor_shape = subdiag_tensor->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((IsVector(subdiag_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
|
||||
"invalid Input[subdiag]")
|
||||
auto rhs_tensor = ctx.Input(3);
|
||||
auto rhs_tensor_shape = rhs_tensor->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((IsMatrix(rhs_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID, "invalid Input[rhs]")
|
||||
auto superdiag_shape = superdiag_tensor_shape->GetDimSizes();
|
||||
auto maindiag_shape = maindiag_tensor_shape->GetDimSizes();
|
||||
auto subdiag_shape = subdiag_tensor_shape->GetDimSizes();
|
||||
auto rhs_shape = rhs_tensor_shape->GetDimSizes();
|
||||
int32_t superdiag_dims = superdiag_tensor_shape->GetDims();
|
||||
int32_t maindiag_dims = maindiag_tensor_shape->GetDims();
|
||||
int32_t subdiag_dims = subdiag_tensor_shape->GetDims();
|
||||
int32_t rhs_dims = rhs_tensor_shape->GetDims();
|
||||
int64_t length = rhs_shape[rhs_dims - 2];
|
||||
KERNEL_CHECK_FALSE((superdiag_shape[superdiag_dims - 1] == length), KERNEL_STATUS_PARAM_INVALID,
|
||||
"invalid Input superdiag length")
|
||||
KERNEL_CHECK_FALSE((maindiag_shape[maindiag_dims - 1] == length), KERNEL_STATUS_PARAM_INVALID,
|
||||
"invalid Input maindiag length")
|
||||
KERNEL_CHECK_FALSE((subdiag_shape[subdiag_dims - 1] == length), KERNEL_STATUS_PARAM_INVALID,
|
||||
"invalid Input subdiag length")
|
||||
using VectorMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>;
|
||||
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
|
||||
VectorMap superdiag(reinterpret_cast<T *>(superdiag_tensor->GetData()), superdiag_shape[superdiag_dims - 1], 1);
|
||||
VectorMap maindiag(reinterpret_cast<T *>(maindiag_tensor->GetData()), maindiag_shape[maindiag_dims - 1], 1);
|
||||
VectorMap subdiag(reinterpret_cast<T *>(subdiag_tensor->GetData()), subdiag_shape[subdiag_dims - 1], 1);
|
||||
MatrixMap rhs(reinterpret_cast<T *>(rhs_tensor->GetData()), rhs_shape[rhs_dims - 2], rhs_shape[rhs_dims - 1]);
|
||||
auto y_tensor = ctx.Output(0);
|
||||
auto y_shape = y_tensor->GetTensorShape()->GetDimSizes();
|
||||
int32_t y_dims = y_tensor->GetTensorShape()->GetDims();
|
||||
MatrixMap y(reinterpret_cast<T *>(y_tensor->GetData()), y_shape[y_dims - 2], y_shape[y_dims - 1]);
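  // Row-wise tridiagonal product:
  // y.row(i) = maindiag(i) * rhs.row(i) + superdiag(i) * rhs.row(i + 1) + subdiag(i) * rhs.row(i - 1),
  // dropping the terms that fall outside [0, length).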
|
||||
y.array() = rhs.array().colwise() * maindiag.array();
|
||||
for (int64_t i = 0; i < length - 1; i++) {
|
||||
y.array().row(i) += rhs.array().row(i + 1) * superdiag(i);
|
||||
y.array().row(i + 1) += rhs.array().row(i) * subdiag(i + 1);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kTridiagonalMatMul, TridiagonalMatMulCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,37 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_TRIDIAGONALMATMUL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_TRIDIAGONALMATMUL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class TridiagonalMatMulCpuKernel : public CpuKernel {
|
||||
public:
|
||||
TridiagonalMatMulCpuKernel() = default;
|
||||
~TridiagonalMatMulCpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t TridiagonalMatMulCompute(CpuKernelContext &ctx);
|
||||
uint32_t TridiagonalMatMulDataAndTypeCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_TRIDIAGONALMATMUL_H_
|
|
@ -0,0 +1,93 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tril_indices.h"
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kTrilIndices = "TrilIndices";
|
||||
|
||||
#define TRIL_INDICES_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = DoCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("TrilIndices kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t TrilIndicesCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
Tensor *output = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed.")
|
||||
auto data_type = ctx.Output(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
TRIL_INDICES_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
TRIL_INDICES_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("TrilIndices kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t TrilIndicesCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
AttrValue *row_ptr = ctx.GetAttr("row");
|
||||
AttrValue *col_ptr = ctx.GetAttr("col");
|
||||
AttrValue *offset_ptr = ctx.GetAttr("offset");
|
||||
int64_t row = row_ptr->GetInt();
|
||||
int64_t col = col_ptr->GetInt();
|
||||
int64_t offset = (offset_ptr == nullptr) ? 0 : (offset_ptr->GetInt());
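  // tril_size counts the lower triangle as a trapezoid of rows plus any full rows below it;
  // e.g. row = 3, col = 3, offset = 0 gives tril_size = 6 and an output of
  // rows {0, 1, 1, 2, 2, 2} followed by cols {0, 0, 1, 0, 1, 2}.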
|
||||
|
||||
auto m_first_row = offset > 0 ? std::min<int64_t>(col, 1 + offset) : row + offset > 0;
|
||||
auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset));
|
||||
auto n_row_all = std::max<int64_t>(0, std::min<int64_t>(row, row + offset));
|
||||
auto n_row_trapezoid = (m_last_row - m_first_row + 1);
|
||||
auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1;
|
||||
auto diff_row = n_row_all - n_row_trapezoid;
|
||||
if (diff_row > 0) {
|
||||
tril_size += diff_row * col;
|
||||
}
|
||||
|
||||
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
|
||||
|
||||
int64_t i = 0;
|
||||
int64_t r = std::max<int64_t>(0, -offset), c = 0;
|
||||
while (i < tril_size) {
|
||||
output[i] = r;
|
||||
output[tril_size + i++] = c;
|
||||
c += 1;
|
||||
if (c > r + offset || c >= col) {
|
||||
r += 1;
|
||||
c = 0;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kTrilIndices, TrilIndicesCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_TRIL_INDICES_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_TRIL_INDICES_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
class TrilIndicesCpuKernel : public CpuKernel {
|
||||
public:
|
||||
TrilIndicesCpuKernel() = default;
|
||||
~TrilIndicesCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
|
||||
int32_t offset = 0;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,874 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "triplet_margin_loss.h"
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "utils/broadcast_iterator.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 4;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const int64_t kNoBroadcastValue = 1;
|
||||
const char *kTripletMarginLoss = "TripletMarginLoss";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 28 * 1024;
|
||||
const int64_t kParallelDataNumMid = 56 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t TripletMarginLossCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
" TripletMarginLoss check input and output number failed.");
|
||||
auto data_type_x = static_cast<DataType>(ctx.Input(0)->GetDataType());
|
||||
auto data_type_positive = static_cast<DataType>(ctx.Input(1)->GetDataType());
|
||||
auto data_type_negative = static_cast<DataType>(ctx.Input(2)->GetDataType());
|
||||
if (data_type_x != data_type_negative || data_type_positive != data_type_negative ||
|
||||
data_type_x != data_type_positive) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"[%s] Data type of inputs requires to be the same, but got data type "
|
||||
"[%s] and "
|
||||
"[%s], type[%s].",
|
||||
ctx.GetOpType().c_str(), DTypeStr(data_type_x).c_str(), DTypeStr(data_type_positive).c_str(),
|
||||
DTypeStr(data_type_negative).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
AttrValue *Attr_p = ctx.GetAttr("p");
|
||||
int p_value = (Attr_p == nullptr) ? 2 : Attr_p->GetInt();
|
||||
float margin_value = *(reinterpret_cast<float *>(ctx.Input(3)->GetData()));
|
||||
AttrValue *Attr_eps = ctx.GetAttr("eps");
|
||||
float eps_value = (Attr_eps == nullptr) ? 1e-6 : Attr_eps->GetFloat();
|
||||
AttrValue *Attr_swap = ctx.GetAttr("swap");
|
||||
bool swap_value = (Attr_swap == nullptr) ? false : Attr_swap->GetBool();
|
||||
AttrValue *Attr_red = ctx.GetAttr("reduction");
|
||||
std::string reduction_value = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
|
||||
Tensor *input_x = (ctx.Input(0));
|
||||
Tensor *input_positive = (ctx.Input(1));
|
||||
Tensor *input_negative = (ctx.Input(2));
|
||||
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
|
||||
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
|
||||
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> broadcast_shape;
|
||||
std::vector<int64_t> broadcast_shape_x_and_positive;
|
||||
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
|
||||
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
|
||||
int64_t num_elements = 1;
|
||||
for (size_t i = 0; i < broadcast_shape.size(); i++) {
|
||||
num_elements *= broadcast_shape[i];
|
||||
}
|
||||
int64_t data_num_output_reduction_none = (num_elements) / (broadcast_shape[1]);
|
||||
int64_t data_num_each_batch_input = (num_elements) / (broadcast_shape[0]);
|
||||
int64_t data_num_each_batch_output_reduction_none = data_num_output_reduction_none / (broadcast_shape[0]);
|
||||
int64_t batch_size = broadcast_shape[0];
|
||||
int64_t once_compute_size = broadcast_shape[1];
|
||||
bool broadcast = false;
|
||||
std::vector<int64_t> x_reshape_vector = shape_x;
|
||||
std::vector<int64_t> positive_reshape_vector = shape_positive;
|
||||
std::vector<int64_t> negative_reshape_vector = shape_negative;
|
||||
if (shape_x != shape_positive || shape_x != shape_negative || shape_positive != shape_negative) {
|
||||
broadcast = true;
|
||||
std::reverse(x_reshape_vector.begin(), x_reshape_vector.end());
|
||||
std::reverse(positive_reshape_vector.begin(), positive_reshape_vector.end());
|
||||
std::reverse(negative_reshape_vector.begin(), negative_reshape_vector.end());
|
||||
int64_t dim_num_x = input_x->GetTensorShape()->GetDims();
|
||||
int64_t dim_num_positive = input_positive->GetTensorShape()->GetDims();
|
||||
int64_t dim_num_negative = input_negative->GetTensorShape()->GetDims();
|
||||
auto dims = std::max(dim_num_x, std::max(dim_num_positive, dim_num_negative));
|
||||
if (dim_num_x < dims) x_reshape_vector.resize(dims, kNoBroadcastValue);
|
||||
if (dim_num_positive < dims) positive_reshape_vector.resize(dims, kNoBroadcastValue);
|
||||
if (dim_num_negative < dims) negative_reshape_vector.resize(dims, kNoBroadcastValue);
|
||||
std::reverse(x_reshape_vector.begin(), x_reshape_vector.end());
|
||||
std::reverse(positive_reshape_vector.begin(), positive_reshape_vector.end());
|
||||
std::reverse(negative_reshape_vector.begin(), negative_reshape_vector.end());
|
||||
}
|
||||
switch (data_type_x) {
|
||||
case DT_FLOAT16:
|
||||
return TripletMarginLossComputeRealTypeFloat16<Eigen::half>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_FLOAT:
|
||||
return TripletMarginLossComputeRealType<float>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_DOUBLE:
|
||||
return TripletMarginLossComputeRealType<double>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_INT8:
|
||||
return TripletMarginLossComputeRealType<int8_t>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_INT16:
|
||||
return TripletMarginLossComputeRealType<int16_t>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_INT32:
|
||||
return TripletMarginLossComputeRealType<int32_t>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_INT64:
|
||||
return TripletMarginLossComputeRealType<int64_t>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_UINT8:
|
||||
return TripletMarginLossComputeRealType<uint8_t>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_UINT16:
|
||||
return TripletMarginLossComputeRealType<uint16_t>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_UINT32:
|
||||
return TripletMarginLossComputeRealType<uint32_t>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_UINT64:
|
||||
return TripletMarginLossComputeRealType<uint64_t>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_COMPLEX128:
|
||||
return TripletMarginLossComputeComplexType<std::complex<double>>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
case DT_COMPLEX64:
|
||||
return TripletMarginLossComputeComplexType<std::complex<float>>(
|
||||
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
|
||||
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
|
||||
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(data_type_x).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t TripletMarginLossCpuKernel::TripletMarginLossComputeRealType(
|
||||
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value, std::string reduction_value,
|
||||
int64_t num_elements, int64_t data_num_output_reduction_none, int64_t data_num_each_batch_input,
|
||||
int64_t data_num_each_batch_output_reduction_none, int64_t batch_size, int64_t once_compute_size, bool broadcast,
|
||||
std::vector<int64_t> x_reshape_vector, std::vector<int64_t> positive_reshape_vector,
|
||||
std::vector<int64_t> negative_reshape_vector) {
|
||||
constexpr int ADULT_AGE = 4;
|
||||
Tensor *input_x = (ctx.Input(0));
|
||||
Tensor *input_positive = (ctx.Input(1));
|
||||
Tensor *input_negative = (ctx.Input(2));
|
||||
Tensor *output = (ctx.Output(0));
|
||||
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
|
||||
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
|
||||
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
|
||||
T *x_data = reinterpret_cast<T *>(input_x->GetData());
|
||||
T *positive_data = reinterpret_cast<T *>(input_positive->GetData());
|
||||
T *negative_data = reinterpret_cast<T *>(input_negative->GetData());
|
||||
std::vector<int64_t> broadcast_shape;
|
||||
std::vector<int64_t> broadcast_shape_x_and_positive;
|
||||
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
|
||||
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
|
||||
std::vector<T> x_broadcast_tensor;
|
||||
std::vector<T> positive_broadcast_tensor;
|
||||
std::vector<T> negative_broadcast_tensor;
|
||||
if (broadcast == true) {
|
||||
auto shape_x1 = shape_x;
|
||||
auto shape_x2 = shape_x;
|
||||
auto shape_positive1 = shape_positive;
|
||||
auto shape_negative1 = shape_negative;
|
||||
auto broadcast_shape1 = broadcast_shape;
|
||||
auto broadcast_shape2 = broadcast_shape;
|
||||
BroadcastIterator iter1(shape_x1, shape_positive1, broadcast_shape1);
|
||||
BroadcastIterator iter2(shape_x2, shape_negative1, broadcast_shape2);
|
||||
iter1.SetPos(0);
|
||||
iter2.SetPos(0);
|
||||
for (int64_t i = 0; i < num_elements; i++) {
|
||||
x_broadcast_tensor.push_back(x_data[iter1.GetInputPosA()]);
|
||||
positive_broadcast_tensor.push_back(positive_data[iter1.GetInputPosB()]);
|
||||
negative_broadcast_tensor.push_back(negative_data[iter2.GetInputPosB()]);
|
||||
iter1.GenNextPos();
|
||||
iter2.GenNextPos();
|
||||
}
|
||||
x_data = x_broadcast_tensor.data();
|
||||
positive_data = positive_broadcast_tensor.data();
|
||||
negative_data = negative_broadcast_tensor.data();
|
||||
}
|
||||
auto output_data = reinterpret_cast<float *>(output->GetData());
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> output_reduction_none(data_num_output_reduction_none, 1);
|
||||
float *output_reduction_none_data = reinterpret_cast<float *>(output_reduction_none.data());
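  // Per sample: loss = max(d(a, p) + margin - d(a, n), 0), where d is the p-norm of the
  // element-wise differences (each offset by eps); with swap enabled, d(a, n) is replaced by
  // min(d(a, n), d(p, n)).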
|
||||
auto shard_triplet_margin_loss = [&](int64_t start, int64_t end) {
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
|
||||
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
|
||||
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
|
||||
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
|
||||
int64_t once_compute_thread_size = (end - start);
|
||||
float positive_distance;
|
||||
float negative_distance;
|
||||
float swap_distance;
|
||||
float temp1;
|
||||
float temp2;
|
||||
float temp3;
|
||||
if (data_num_each_batch_input == 0) {
|
||||
KERNEL_LOG_ERROR("data_num_each_batch_input could not be 0.");
|
||||
}
|
||||
for (int64_t n = 0; n < once_compute_thread_size / data_num_each_batch_input; n++) {
|
||||
int64_t i = start / data_num_each_batch_input;
|
||||
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
|
||||
for (int64_t k = 0; k < once_compute_size; k++) {
|
||||
*(calculate_positive_distance_data + k) =
|
||||
eps_value +
|
||||
static_cast<float>(
|
||||
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(
|
||||
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
*(calculate_negative_distance_data + k) =
|
||||
eps_value +
|
||||
static_cast<float>(
|
||||
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(
|
||||
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
if (swap_value == true) {
|
||||
*(calculate_swap_distance_data + k) =
|
||||
eps_value +
|
||||
static_cast<float>(
|
||||
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(
|
||||
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
}
|
||||
}
|
||||
calculate_positive_distance = (calculate_positive_distance).abs();
|
||||
calculate_negative_distance = (calculate_negative_distance).abs();
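// Raise each |difference| to the p-th power by repeated multiplication; the p-norm is then
// completed by summing the powers and taking the (1 / p)-th root.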
|
||||
for (int64_t n = 0; n < once_compute_size; n++) {
|
||||
temp1 = *(calculate_positive_distance_data + n);
|
||||
temp2 = *(calculate_negative_distance_data + n);
|
||||
for (int64_t l = 1; l < p_value; l++) {
|
||||
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
|
||||
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
|
||||
}
|
||||
}
|
||||
positive_distance =
|
||||
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value)));
|
||||
negative_distance =
|
||||
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value)));
|
||||
if (broadcast == true) {
|
||||
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
|
||||
if (swap_value == true) {
|
||||
calculate_swap_distance = ((calculate_swap_distance)).abs();
|
||||
for (int64_t n = 0; n < once_compute_size; n++) {
|
||||
temp3 = *(calculate_swap_distance_data + n);
|
||||
for (int64_t l = 1; l < p_value; l++) {
|
||||
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
|
||||
}
|
||||
}
|
||||
swap_distance =
|
||||
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value)));
|
||||
if (broadcast == true) {
|
||||
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
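// With swap enabled, apply the "distance swap": keep the smaller of d(anchor, negative) and
// d(positive, negative) as the negative distance.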
|
||||
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
|
||||
}
|
||||
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
|
||||
(positive_distance + margin_value - negative_distance > 0)
|
||||
? (positive_distance + margin_value - negative_distance)
|
||||
: 0;
|
||||
}
|
||||
start += data_num_each_batch_input;
|
||||
}
|
||||
};
|
||||
if (num_elements * sizeof(T) > kParallelDataNum) {
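// Parallelize only when the input is large enough to amortize scheduling overhead; each shard is
// sized to a multiple of data_num_each_batch_input so a batch never splits across threads.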
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (num_elements * sizeof(T) <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
CpuKernelUtils::ParallelFor(ctx, num_elements,
|
||||
data_num_each_batch_input * ADULT_AGE * (batch_size / max_core_num + 1),
|
||||
shard_triplet_margin_loss);
|
||||
} else {
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
|
||||
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
|
||||
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
|
||||
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
|
||||
float positive_distance;
|
||||
float negative_distance;
|
||||
float swap_distance;
|
||||
float temp1;
|
||||
float temp2;
|
||||
float temp3;
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
|
||||
for (int64_t k = 0; k < once_compute_size; k++) {
|
||||
*(calculate_positive_distance_data + k) =
|
||||
eps_value +
|
||||
static_cast<float>(
|
||||
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(
|
||||
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
*(calculate_negative_distance_data + k) =
|
||||
eps_value +
|
||||
static_cast<float>(
|
||||
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(
|
||||
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
if (swap_value == true) {
|
||||
*(calculate_swap_distance_data + k) =
|
||||
eps_value +
|
||||
static_cast<float>(
|
||||
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(
|
||||
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
}
|
||||
}
|
||||
calculate_positive_distance = (calculate_positive_distance).abs();
|
||||
calculate_negative_distance = (calculate_negative_distance).abs();
|
||||
for (int64_t n = 0; n < once_compute_size; n++) {
|
||||
temp1 = *(calculate_positive_distance_data + n);
|
||||
temp2 = *(calculate_negative_distance_data + n);
|
||||
for (int64_t l = 1; l < p_value; l++) {
|
||||
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
|
||||
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
|
||||
}
|
||||
}
|
||||
positive_distance =
|
||||
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value)));
|
||||
negative_distance =
|
||||
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value)));
|
||||
if (broadcast == true) {
|
||||
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
|
||||
if (swap_value == true) {
|
||||
calculate_swap_distance = ((calculate_swap_distance)).abs();
|
||||
for (int64_t n = 0; n < once_compute_size; n++) {
|
||||
temp3 = *(calculate_swap_distance_data + n);
|
||||
for (int64_t l = 1; l < p_value; l++) {
|
||||
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
|
||||
}
|
||||
}
|
||||
swap_distance =
|
||||
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value)));
|
||||
if (broadcast == true) {
|
||||
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
|
||||
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
|
||||
}
|
||||
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
|
||||
(positive_distance + margin_value - negative_distance > 0)
|
||||
? (positive_distance + margin_value - negative_distance)
|
||||
: 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (reduction_value == "none") {
|
||||
for (int64_t i = 0; i < data_num_output_reduction_none; i++) {
|
||||
*(output_data + i) = *(output_reduction_none_data + i);
|
||||
}
|
||||
}
|
||||
if (reduction_value == "mean") {
|
||||
*(output_data) = (output_reduction_none.mean());
|
||||
}
|
||||
if (reduction_value == "sum") {
|
||||
*(output_data) = (output_reduction_none.sum());
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t TripletMarginLossCpuKernel::TripletMarginLossComputeComplexType(
|
||||
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value, std::string reduction_value,
|
||||
int64_t num_elements, int64_t data_num_output_reduction_none, int64_t data_num_each_batch_input,
|
||||
int64_t data_num_each_batch_output_reduction_none, int64_t batch_size, int64_t once_compute_size, bool broadcast,
|
||||
std::vector<int64_t> x_reshape_vector, std::vector<int64_t> positive_reshape_vector,
|
||||
std::vector<int64_t> negative_reshape_vector) {
|
||||
constexpr int ADULT_AGE = 4;
|
||||
Tensor *input_x = (ctx.Input(0));
|
||||
Tensor *input_positive = (ctx.Input(1));
|
||||
Tensor *input_negative = (ctx.Input(2));
|
||||
Tensor *output = (ctx.Output(0));
|
||||
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
|
||||
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
|
||||
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
|
||||
T *x_data = reinterpret_cast<T *>(input_x->GetData());
|
||||
T *positive_data = reinterpret_cast<T *>(input_positive->GetData());
|
||||
T *negative_data = reinterpret_cast<T *>(input_negative->GetData());
|
||||
std::vector<int64_t> broadcast_shape;
|
||||
std::vector<int64_t> broadcast_shape_x_and_positive;
|
||||
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
|
||||
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
|
||||
std::vector<T> x_broadcast_tensor;
|
||||
std::vector<T> positive_broadcast_tensor;
|
||||
std::vector<T> negative_broadcast_tensor;
|
||||
if (broadcast == true) {
|
||||
auto shape_x1 = shape_x;
|
||||
auto shape_x2 = shape_x;
|
||||
auto shape_positive1 = shape_positive;
|
||||
auto shape_negative1 = shape_negative;
|
||||
auto broadcast_shape1 = broadcast_shape;
|
||||
auto broadcast_shape2 = broadcast_shape;
|
||||
BroadcastIterator iter1(shape_x1, shape_positive1, broadcast_shape1);
|
||||
BroadcastIterator iter2(shape_x2, shape_negative1, broadcast_shape2);
|
||||
iter1.SetPos(0);
|
||||
iter2.SetPos(0);
|
||||
for (int64_t i = 0; i < num_elements; i++) {
|
||||
x_broadcast_tensor.push_back(x_data[iter1.GetInputPosA()]);
|
||||
positive_broadcast_tensor.push_back(positive_data[iter1.GetInputPosB()]);
|
||||
negative_broadcast_tensor.push_back(negative_data[iter2.GetInputPosB()]);
|
||||
iter1.GenNextPos();
|
||||
iter2.GenNextPos();
|
||||
}
|
||||
x_data = x_broadcast_tensor.data();
|
||||
positive_data = positive_broadcast_tensor.data();
|
||||
negative_data = negative_broadcast_tensor.data();
|
||||
}
|
||||
auto output_data = reinterpret_cast<float *>(output->GetData());
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> output_reduction_none(data_num_output_reduction_none, 1);
|
||||
float *output_reduction_none_data = reinterpret_cast<float *>(output_reduction_none.data());
|
||||
auto shard_triplet_margin_loss = [&](int64_t start, int64_t end) {
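// Unlike the real-type path, the scratch buffers in this lambda hold complex values of type T;
// real-valued p-norm distances are derived from them per position further down.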
|
||||
Eigen::Array<T, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
|
||||
Eigen::Array<T, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
|
||||
Eigen::Array<T, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
|
||||
T *calculate_positive_distance_data = reinterpret_cast<T *>(calculate_positive_distance.data());
|
||||
T *calculate_negative_distance_data = reinterpret_cast<T *>(calculate_negative_distance.data());
|
||||
T *calculate_swap_distance_data = reinterpret_cast<T *>(calculate_swap_distance.data());
|
||||
int64_t once_compute_thread_size = end - start;
|
||||
float positive_distance;
|
||||
float negative_distance;
|
||||
float swap_distance;
|
||||
if (data_num_each_batch_input == 0) {
|
||||
KERNEL_LOG_ERROR("data_num_each_batch_input could not be 0.");
|
||||
}
|
||||
for (int64_t n = 0; n < (once_compute_thread_size) / data_num_each_batch_input; n++) {
|
||||
int64_t i = start / data_num_each_batch_input;
|
||||
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
|
||||
for (int64_t k = 0; k < once_compute_size; k++) {
|
||||
*(calculate_positive_distance_data + k) =
|
||||
static_cast<T>(eps_value) +
|
||||
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
*(calculate_negative_distance_data + k) =
|
||||
static_cast<T>(eps_value) +
|
||||
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
if (swap_value == true) {
|
||||
*(calculate_swap_distance_data + k) =
|
||||
static_cast<T>(eps_value) +
|
||||
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
}
|
||||
}
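// Complex inputs: the element-wise magnitude is obtained as sqrt((z * conj(z)).real()) before
// Eigen's pow/sum builds the p-norm.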
|
||||
auto calculate_positive_distance_float =
|
||||
(calculate_positive_distance * (calculate_positive_distance.matrix().conjugate().array())).real().sqrt();
|
||||
auto calculate_negative_distance_float =
|
||||
(calculate_negative_distance * (calculate_negative_distance.matrix().conjugate().array())).real().sqrt();
|
||||
positive_distance =
|
||||
std::pow(calculate_positive_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
|
||||
negative_distance =
|
||||
std::pow(calculate_negative_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
|
||||
if (broadcast == true) {
|
||||
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
|
||||
if (swap_value == true) {
|
||||
auto calculate_swap_distance_float =
|
||||
(calculate_swap_distance * (calculate_swap_distance.matrix().conjugate().array())).real().sqrt();
|
||||
swap_distance = std::pow(calculate_swap_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
|
||||
if (broadcast == true) {
|
||||
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
|
||||
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
|
||||
}
|
||||
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
|
||||
(positive_distance + margin_value - negative_distance > 0)
|
||||
? (positive_distance + margin_value - negative_distance)
|
||||
: 0;
|
||||
}
|
||||
start += data_num_each_batch_input;
|
||||
}
|
||||
};
|
||||
if (num_elements * sizeof(T) > kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (num_elements * sizeof(T) <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
CpuKernelUtils::ParallelFor(ctx, num_elements,
|
||||
data_num_each_batch_input * ADULT_AGE * (batch_size / max_core_num + 1),
|
||||
shard_triplet_margin_loss);
|
||||
} else {
|
||||
Eigen::Array<T, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
|
||||
Eigen::Array<T, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
|
||||
Eigen::Array<T, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
|
||||
T *calculate_positive_distance_data = reinterpret_cast<T *>(calculate_positive_distance.data());
|
||||
T *calculate_negative_distance_data = reinterpret_cast<T *>(calculate_negative_distance.data());
|
||||
T *calculate_swap_distance_data = reinterpret_cast<T *>(calculate_swap_distance.data());
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
|
||||
for (int64_t k = 0; k < once_compute_size; k++) {
|
||||
*(calculate_positive_distance_data + k) =
|
||||
static_cast<T>(eps_value) +
|
||||
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
*(calculate_negative_distance_data + k) =
|
||||
static_cast<T>(eps_value) +
|
||||
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
if (swap_value == true) {
|
||||
*(calculate_swap_distance_data + k) =
|
||||
static_cast<T>(eps_value) +
|
||||
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
|
||||
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
|
||||
}
|
||||
}
|
||||
float positive_distance;
|
||||
float negative_distance;
|
||||
float swap_distance;
|
||||
auto calculate_positive_distance_float =
|
||||
(calculate_positive_distance * (calculate_positive_distance.matrix().conjugate().array())).real().sqrt();
|
||||
auto calculate_negative_distance_float =
|
||||
(calculate_negative_distance * (calculate_negative_distance.matrix().conjugate().array())).real().sqrt();
|
||||
positive_distance =
|
||||
std::pow(calculate_positive_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
|
||||
negative_distance =
|
||||
std::pow(calculate_negative_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
|
||||
if (broadcast == true) {
|
||||
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
|
||||
if (swap_value == true) {
|
||||
auto calculate_swap_distance_float =
|
||||
(calculate_swap_distance * (calculate_swap_distance.matrix().conjugate().array())).real().sqrt();
|
||||
swap_distance = std::pow(calculate_swap_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
|
||||
if (broadcast == true) {
|
||||
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
|
||||
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
|
||||
}
|
||||
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
|
||||
(positive_distance + margin_value - negative_distance > 0)
|
||||
? positive_distance + margin_value - negative_distance
|
||||
: 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (reduction_value == "none") {
|
||||
for (int64_t i = 0; i < data_num_output_reduction_none; i++) {
|
||||
*(output_data + i) = *(output_reduction_none_data + i);
|
||||
}
|
||||
}
|
||||
if (reduction_value == "mean") {
|
||||
*(output_data) = (output_reduction_none.mean());
|
||||
}
|
||||
if (reduction_value == "sum") {
|
||||
*(output_data) = (output_reduction_none.sum());
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t TripletMarginLossCpuKernel::TripletMarginLossComputeRealTypeFloat16(
|
||||
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value, std::string reduction_value,
|
||||
int64_t num_elements, int64_t data_num_output_reduction_none, int64_t data_num_each_batch_input,
|
||||
int64_t data_num_each_batch_output_reduction_none, int64_t batch_size, int64_t once_compute_size, bool broadcast,
|
||||
std::vector<int64_t> x_reshape_vector, std::vector<int64_t> positive_reshape_vector,
|
||||
std::vector<int64_t> negative_reshape_vector) {
|
||||
constexpr int ADULT_AGE = 4;
|
||||
Tensor *input_x = (ctx.Input(0));
|
||||
Tensor *input_positive = (ctx.Input(1));
|
||||
Tensor *input_negative = (ctx.Input(2));
|
||||
Tensor *output = (ctx.Output(0));
|
||||
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
|
||||
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
|
||||
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
|
||||
T *x_data = reinterpret_cast<T *>(input_x->GetData());
|
||||
T *positive_data = reinterpret_cast<T *>(input_positive->GetData());
|
||||
T *negative_data = reinterpret_cast<T *>(input_negative->GetData());
|
||||
std::vector<int64_t> broadcast_shape;
|
||||
std::vector<int64_t> broadcast_shape_x_and_positive;
|
||||
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
|
||||
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
|
||||
std::vector<T> x_broadcast_tensor;
|
||||
std::vector<T> positive_broadcast_tensor;
|
||||
std::vector<T> negative_broadcast_tensor;
|
||||
if (broadcast == true) {
|
||||
auto shape_x1 = shape_x;
|
||||
auto shape_x2 = shape_x;
|
||||
auto shape_positive1 = shape_positive;
|
||||
auto shape_negative1 = shape_negative;
|
||||
auto broadcast_shape1 = broadcast_shape;
|
||||
auto broadcast_shape2 = broadcast_shape;
|
||||
BroadcastIterator iter1(shape_x1, shape_positive1, broadcast_shape1);
|
||||
BroadcastIterator iter2(shape_x2, shape_negative1, broadcast_shape2);
|
||||
iter1.SetPos(0);
|
||||
iter2.SetPos(0);
|
||||
for (int64_t i = 0; i < num_elements; i++) {
|
||||
x_broadcast_tensor.push_back(x_data[iter1.GetInputPosA()]);
|
||||
positive_broadcast_tensor.push_back(positive_data[iter1.GetInputPosB()]);
|
||||
negative_broadcast_tensor.push_back(negative_data[iter2.GetInputPosB()]);
|
||||
iter1.GenNextPos();
|
||||
iter2.GenNextPos();
|
||||
}
|
||||
x_data = x_broadcast_tensor.data();
|
||||
positive_data = positive_broadcast_tensor.data();
|
||||
negative_data = negative_broadcast_tensor.data();
|
||||
}
|
||||
auto output_data = reinterpret_cast<T *>(output->GetData());
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> output_reduction_none(data_num_output_reduction_none, 1);
|
||||
float *output_reduction_none_data = reinterpret_cast<float *>(output_reduction_none.data());
|
||||
auto shard_triplet_margin_loss = [&](int64_t start, int64_t end) {
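// Float16 path: every Eigen::half element is promoted to float before accumulation, so the p-norm
// and the hinge are evaluated in single precision; results are cast back to half only when the
// reduced output is written.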
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
|
||||
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
|
||||
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
|
||||
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
|
||||
int64_t once_compute_thread_size = end - start;
|
||||
float positive_distance;
|
||||
float negative_distance;
|
||||
float swap_distance;
|
||||
float temp1;
|
||||
float temp2;
|
||||
float temp3;
|
||||
if (data_num_each_batch_input == 0) {
|
||||
KERNEL_LOG_ERROR("data_num_each_batch_input could not be 0.");
|
||||
}
|
||||
for (int64_t n = 0; n < (once_compute_thread_size) / data_num_each_batch_input; n++) {
|
||||
int64_t i = start / data_num_each_batch_input;
|
||||
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
|
||||
for (int64_t k = 0; k < once_compute_size; k++) {
|
||||
*(calculate_positive_distance_data + k) =
|
||||
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)));
|
||||
*(calculate_negative_distance_data + k) =
|
||||
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)));
|
||||
if (swap_value == true) {
|
||||
*(calculate_swap_distance_data + k) =
|
||||
eps_value + (static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)));
|
||||
}
|
||||
}
|
||||
calculate_positive_distance = (calculate_positive_distance).abs();
|
||||
calculate_negative_distance = (calculate_negative_distance).abs();
|
||||
for (int64_t n = 0; n < once_compute_size; n++) {
|
||||
temp1 = *(calculate_positive_distance_data + n);
|
||||
temp2 = *(calculate_negative_distance_data + n);
|
||||
for (int64_t l = 1; l < p_value; l++) {
|
||||
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
|
||||
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
|
||||
}
|
||||
}
|
||||
positive_distance = static_cast<float>(
|
||||
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value))));
|
||||
negative_distance = static_cast<float>(
|
||||
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value))));
|
||||
if (broadcast == true) {
  if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
    positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
  }
  if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
    negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
  }
}
|
||||
if (swap_value == true) {
|
||||
calculate_swap_distance = ((calculate_swap_distance)).abs();
|
||||
for (int64_t n = 0; n < once_compute_size; n++) {
|
||||
temp3 = *(calculate_swap_distance_data + n);
|
||||
for (int64_t l = 1; l < p_value; l++) {
|
||||
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
|
||||
}
|
||||
}
|
||||
swap_distance = static_cast<float>(
|
||||
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value))));
|
||||
if (broadcast == true) {
  if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
    swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
  }
}
|
||||
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
|
||||
}
|
||||
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
|
||||
(positive_distance + margin_value - negative_distance > static_cast<float>(0))
|
||||
? ((positive_distance + margin_value - negative_distance))
|
||||
: static_cast<float>(0);
|
||||
}
|
||||
start += data_num_each_batch_input;
|
||||
}
|
||||
};
|
||||
if (num_elements * sizeof(T) > kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (num_elements * sizeof(T) <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
CpuKernelUtils::ParallelFor(ctx, num_elements,
|
||||
data_num_each_batch_input * ADULT_AGE * (batch_size / max_core_num + 1),
|
||||
shard_triplet_margin_loss);
|
||||
} else {
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
|
||||
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
|
||||
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
|
||||
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
|
||||
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
|
||||
float positive_distance;
|
||||
float negative_distance;
|
||||
float swap_distance;
|
||||
for (int64_t k = 0; k < once_compute_size; k++) {
|
||||
*(calculate_positive_distance_data + k) =
|
||||
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)));
|
||||
*(calculate_negative_distance_data + k) =
|
||||
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)));
|
||||
if (swap_value == true) {
|
||||
*(calculate_swap_distance_data + k) =
|
||||
eps_value + (static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)) -
|
||||
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
|
||||
k * data_num_each_batch_output_reduction_none)));
|
||||
}
|
||||
}
|
||||
calculate_positive_distance = (calculate_positive_distance).abs();
|
||||
calculate_negative_distance = (calculate_negative_distance).abs();
|
||||
float temp1;
|
||||
float temp2;
|
||||
float temp3;
|
||||
for (int64_t n = 0; n < once_compute_size; n++) {
|
||||
temp1 = *(calculate_positive_distance_data + n);
|
||||
temp2 = *(calculate_negative_distance_data + n);
|
||||
for (int64_t l = 1; l < p_value; l++) {
|
||||
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
|
||||
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
|
||||
}
|
||||
}
|
||||
positive_distance = static_cast<float>(
|
||||
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value))));
|
||||
negative_distance = static_cast<float>(
|
||||
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value))));
|
||||
if (broadcast == true) {
|
||||
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
|
||||
if (swap_value == true) {
|
||||
calculate_swap_distance = ((calculate_swap_distance)).abs();
|
||||
for (int64_t n = 0; n < once_compute_size; n++) {
|
||||
temp3 = *(calculate_swap_distance_data + n);
|
||||
for (int64_t l = 1; l < p_value; l++) {
|
||||
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
|
||||
}
|
||||
}
|
||||
swap_distance = static_cast<float>(
|
||||
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value))));
|
||||
if (broadcast == true) {
|
||||
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
|
||||
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
|
||||
}
|
||||
}
|
||||
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
|
||||
}
|
||||
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
|
||||
(positive_distance + margin_value - negative_distance > static_cast<float>(0))
|
||||
? ((positive_distance + margin_value - negative_distance))
|
||||
: static_cast<float>(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (reduction_value == "none") {
|
||||
for (int64_t i = 0; i < data_num_output_reduction_none; i++) {
|
||||
*(output_data + i) = static_cast<T>(*(output_reduction_none_data + i));
|
||||
}
|
||||
}
|
||||
if (reduction_value == "mean") {
|
||||
*(output_data) = static_cast<T>(output_reduction_none.mean());
|
||||
}
|
||||
if (reduction_value == "sum") {
|
||||
*(output_data) = static_cast<T>(output_reduction_none.sum());
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kTripletMarginLoss, TripletMarginLossCpuKernel);
|
||||
} // namespace aicpu
@ -0,0 +1,57 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_TRIPLET_MARGIN_LOSS_H_
#define AICPU_KERNELS_NORMALIZED_TRIPLET_MARGIN_LOSS_H_

#include <string>
#include <vector>

#include "cpu_ops_kernel.h"

namespace aicpu {
class TripletMarginLossCpuKernel : public CpuKernel {
 public:
  TripletMarginLossCpuKernel() = default;
  ~TripletMarginLossCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  static uint32_t TripletMarginLossComputeRealType(
    CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value,
    std::string reduction_value, int64_t num_elements, int64_t data_num_output_reduction_none,
    int64_t data_num_each_batch_input, int64_t data_num_each_batch_output_reduction_none, int64_t batch_size,
    int64_t once_compute_size, bool broadcast, std::vector<int64_t> x_reshape_vector,
    std::vector<int64_t> positive_reshape_vector, std::vector<int64_t> negative_reshape_vector);

  template <typename T>
  static uint32_t TripletMarginLossComputeRealTypeFloat16(
    CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value,
    std::string reduction_value, int64_t num_elements, int64_t data_num_output_reduction_none,
    int64_t data_num_each_batch_input, int64_t data_num_each_batch_output_reduction_none, int64_t batch_size,
    int64_t once_compute_size, bool broadcast, std::vector<int64_t> x_reshape_vector,
    std::vector<int64_t> positive_reshape_vector, std::vector<int64_t> negative_reshape_vector);

  template <typename T>
  static uint32_t TripletMarginLossComputeComplexType(
    CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value,
    std::string reduction_value, int64_t num_elements, int64_t data_num_output_reduction_none,
    int64_t data_num_each_batch_input, int64_t data_num_each_batch_output_reduction_none, int64_t batch_size,
    int64_t once_compute_size, bool broadcast, std::vector<int64_t> x_reshape_vector,
    std::vector<int64_t> positive_reshape_vector, std::vector<int64_t> negative_reshape_vector);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_TRIPLET_MARGIN_LOSS_H_
@ -0,0 +1,95 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "triu_indices.h"
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kTriuIndices = "TriuIndices";
|
||||
|
||||
#define TRIU_INDICES_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = DoCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("TriuIndices kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t TriuIndicesCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
Tensor *output = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed.")
|
||||
auto data_type = ctx.Output(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
TRIU_INDICES_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
TRIU_INDICES_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("TriuIndices kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t TriuIndicesCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
AttrValue *row_ptr = ctx.GetAttr("row");
|
||||
AttrValue *col_ptr = ctx.GetAttr("col");
|
||||
AttrValue *offset_ptr = ctx.GetAttr("offset");
|
||||
int64_t row = row_ptr->GetInt();
|
||||
int64_t col = col_ptr->GetInt();
|
||||
int64_t offset = (offset_ptr == nullptr) ? 0 : (offset_ptr->GetInt());
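// Count the lower triangle at diagonal (offset - 1) as a trapezoid of clipped rows plus any full
// rows below it; triu_size = row * col - tril_size. The output then stores triu_size row indices
// followed by triu_size column indices, emitted in row-major order by the loop below.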
|
||||
|
||||
auto offset1 = offset - 1;
|
||||
auto m_first_row = offset1 > 0 ? std::min<int64_t>(col, 1 + offset1) : row + offset1 > 0;
|
||||
auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset1));
|
||||
auto n_row_all = std::max<int64_t>(0, std::min<int64_t>(row, row + offset1));
|
||||
auto n_row_trapezoid = (m_last_row - m_first_row + 1);
|
||||
auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1;
|
||||
auto diff_row = n_row_all - n_row_trapezoid;
|
||||
if (diff_row > 0) {
|
||||
tril_size += diff_row * col;
|
||||
}
|
||||
auto triu_size = row * col - tril_size;
|
||||
|
||||
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
|
||||
|
||||
int64_t i = 0;
|
||||
int64_t c = std::max<int64_t>(0, offset), r = 0;
|
||||
while (i < triu_size) {
|
||||
output[i] = r;
|
||||
output[triu_size + i++] = c;
|
||||
c += 1;
|
||||
if (c >= col) {
|
||||
r += 1;
|
||||
c = std::max<int64_t>(0, r + offset);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kTriuIndices, TriuIndicesCpuKernel);
|
||||
} // namespace aicpu
@ -0,0 +1,41 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_TRIU_INDICES_H_
#define AICPU_KERNELS_NORMALIZED_TRIU_INDICES_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include "utils/sparse_tensor.h"

namespace aicpu {
class TriuIndicesCpuKernel : public CpuKernel {
 public:
  TriuIndicesCpuKernel() = default;
  ~TriuIndicesCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  uint32_t DoCompute(CpuKernelContext &ctx);

  int32_t offset = 0;
  int32_t offset1 = 0;
};
}  // namespace aicpu
#endif
@ -0,0 +1,209 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "unpack.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kUnpack = "Unpack";
|
||||
}
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t UnpackCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
|
||||
Tensor *value_ptr = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(value_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input value failed.");
|
||||
value_data_ptr = value_ptr->GetData();
|
||||
KERNEL_CHECK_NULLPTR(value_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input value data failed.");
|
||||
auto value_shape_ptr = value_ptr->GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(value_shape_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input value shape failed.");
|
||||
int64_t value_dim = value_shape_ptr->GetDims();
|
||||
|
||||
AttrValue *unpack_axis_ptr = ctx.GetAttr("axis");
|
||||
int64_t real_unpack_axis = 0;
|
||||
KERNEL_CHECK_FALSE(unpack_axis_ptr, KERNEL_STATUS_PARAM_INVALID, "get axis failed!");
|
||||
unpack_axis = unpack_axis_ptr->GetInt();
|
||||
real_unpack_axis = unpack_axis >= 0 ? unpack_axis : unpack_axis + value_dim;
|
||||
KERNEL_CHECK_FALSE(value_dim > real_unpack_axis, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The axis value range should be [-value_dim, value_dim), "
|
||||
"value dim is [%d], axis is [%d].",
|
||||
value_dim, unpack_axis);
|
||||
unpack_axis = real_unpack_axis;
|
||||
|
||||
AttrValue *unpack_num_ptr = ctx.GetAttr("num");
|
||||
KERNEL_CHECK_FALSE(unpack_num_ptr, KERNEL_STATUS_PARAM_INVALID, "get num failed!");
|
||||
int64_t axis_size = value_shape_ptr->GetDimSize(unpack_axis);
|
||||
unpack_num = unpack_num_ptr->GetInt();
|
||||
KERNEL_CHECK_FALSE(unpack_num == axis_size, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The num you want to unpack to should be equal to the "
|
||||
"size of the specified dimension. "
|
||||
"The num you want to unpack to is [%d], while the [%d] "
|
||||
"dim's size is [%d].",
|
||||
unpack_num, unpack_axis, axis_size);
|
||||
value_shape_vec = value_shape_ptr->GetDimSizes();
|
||||
data_type = value_ptr->GetDataType();
|
||||
value_num = value_ptr->NumElements();
|
||||
|
||||
output_ptr_vec.resize(unpack_num);
|
||||
for (int64_t i = 0; i < unpack_num; i++) {
|
||||
Tensor *output_ptr = ctx.Output(i);
|
||||
KERNEL_CHECK_NULLPTR(output_ptr, KERNEL_STATUS_PARAM_INVALID, "Get output [%d] failed.", i);
|
||||
auto output_data_ptr = output_ptr->GetData();
|
||||
KERNEL_CHECK_NULLPTR(output_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get output data [%d] failed.", i);
|
||||
output_ptr_vec[i] = output_data_ptr;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t UnpackCpuKernel::UnpackWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec) {
|
||||
int64_t copy_size = value_num * sizeof(T);
|
||||
auto mem_ret = memcpy_s(output_data_vec[0], copy_size, input_data_ptr, copy_size);
|
||||
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Memcpy size[%zu] from input value to output[0] failed.", copy_size);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t UnpackCpuKernel::UnpackWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec) {
|
||||
if (value_shape_vec[0] == 0) {
|
||||
KERNEL_CHECK_FALSE(value_shape_vec[0] > 0, KERNEL_STATUS_PARAM_INVALID, "The shape of input tensor is invalid.");
|
||||
}
|
||||
int64_t copy_num = value_num / value_shape_vec[0];
|
||||
T *input_copy_ptr = input_data_ptr;
|
||||
for (int64_t i = 0; i < unpack_num; i++) {
|
||||
int64_t copy_size_per = copy_num;
|
||||
int64_t copy_size = copy_size_per * sizeof(T);
|
||||
auto mem_ret = memcpy_s(output_data_vec[i], copy_size, input_copy_ptr, copy_size);
|
||||
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
|
||||
input_copy_ptr += copy_size_per;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t UnpackCpuKernel::UnpackCompute(T *input_data_ptr, std::vector<T *> output_data_vec, CpuKernelContext &ctx) {
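// General case: view the input as [prefix, midfix, subfix], where prefix is the product of dims
// before the unpack axis, midfix the size of that axis, and subfix the product of the remaining
// dims. Output i is assembled from prefix slices of subfix elements, taken midfix slices apart in
// the flattened input.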
|
||||
int64_t prefix = 1;
|
||||
for (uint64_t i = 0; i < unpack_axis; i++) {
|
||||
if (value_shape_vec[i] == 0) {
|
||||
KERNEL_CHECK_FALSE(value_shape_vec[i] > 0, KERNEL_STATUS_PARAM_INVALID, "The shape of input tensor is invalid.");
|
||||
}
|
||||
prefix *= value_shape_vec[i];
|
||||
}
|
||||
if (unpack_axis >= value_shape_vec.size()) {
|
||||
KERNEL_CHECK_FALSE(unpack_axis < value_shape_vec.size(), KERNEL_STATUS_PARAM_INVALID,
|
||||
"input attr axis is invalid.");
|
||||
}
|
||||
int64_t midfix = value_shape_vec[unpack_axis];
|
||||
int64_t subfix = 1;
|
||||
for (size_t i = unpack_axis + 1; i < value_shape_vec.size(); i++) {
|
||||
if (value_shape_vec[i] == 0) {
|
||||
KERNEL_CHECK_FALSE(value_shape_vec[i] > 0, KERNEL_STATUS_PARAM_INVALID, "The shape of input tensor is invalid.");
|
||||
}
|
||||
subfix *= value_shape_vec[i];
|
||||
}
|
||||
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > unpack_num) {
|
||||
max_core_num = unpack_num;
|
||||
}
|
||||
|
||||
auto shard_unpack = [&](size_t start, size_t end) {
|
||||
int64_t offset = 0;
|
||||
for (uint64_t i = start; i < end; i++) {
|
||||
offset = i * subfix;
|
||||
T *output_data_ptr = output_data_vec[i];
|
||||
T *input_copy_ptr = input_data_ptr + offset;
|
||||
int64_t copy_size = subfix * sizeof(T);
|
||||
for (int64_t j = 0; j < prefix; j++) {
|
||||
auto mem_ret = memcpy_s(output_data_ptr, copy_size, input_copy_ptr, copy_size);
|
||||
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
|
||||
input_copy_ptr += (subfix * midfix);
|
||||
output_data_ptr += subfix;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, unpack_num, unpack_num / max_core_num, shard_unpack),
|
||||
"Unpack Compute failed.");
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t UnpackCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
T *input_data_ptr = reinterpret_cast<T *>(value_data_ptr);
|
||||
std::vector<T *> output_data_vec;
|
||||
output_data_vec.resize(unpack_num);
|
||||
for (int64_t i = 0; i < unpack_num; i++) {
|
||||
output_data_vec[i] = reinterpret_cast<T *>(output_ptr_vec[i]);
|
||||
}
|
||||
if (unpack_num == 1) {
|
||||
KERNEL_CHECK_FALSE((UnpackWithOneOutput<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
|
||||
KERNEL_STATUS_PARAM_INVALID, "UnpackWithOneOutput failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
if (unpack_axis == 0) {
|
||||
KERNEL_CHECK_FALSE((UnpackWithDimZero<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
|
||||
KERNEL_STATUS_PARAM_INVALID, "UnpackWithDimZero failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((UnpackCompute<T>(input_data_ptr, output_data_vec, ctx) == KERNEL_STATUS_OK),
|
||||
KERNEL_STATUS_PARAM_INVALID, "Unpack Compute failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t UnpackCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_CHECK_FALSE((CheckAndInitParams(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"CheckAndInitParams failed.");
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<bool>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<uint16_t>(ctx);
|
||||
case DT_UINT32:
|
||||
return DoCompute<uint32_t>(ctx);
|
||||
case DT_UINT64:
|
||||
return DoCompute<uint64_t>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return DoCompute<std::complex<float>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return DoCompute<std::complex<double>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported data type [%s]", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kUnpack, UnpackCpuKernel);
|
||||
} // namespace aicpu
@ -0,0 +1,65 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_UNPACK_H_
#define AICPU_KERNELS_NORMALIZED_UNPACK_H_

#include <memory>
#include <vector>
#include "cpu_types.h"
#include "utils/bcast.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "securec.h"
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"

namespace aicpu {
class UnpackCpuKernel : public CpuKernel {
 public:
  UnpackCpuKernel()
      : data_type(DT_DOUBLE), unpack_axis(0), unpack_num(0), value_num(0), value_data_ptr(nullptr) {
    output_ptr_vec.clear();
    value_shape_vec.clear();
  }
  ~UnpackCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t CheckAndInitParams(CpuKernelContext &ctx);

  template <typename T>
  uint32_t UnpackWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec);

  template <typename T>
  uint32_t UnpackWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec);

  template <typename T>
  uint32_t UnpackCompute(T *input_data_ptr, std::vector<T *> output_data_vec, CpuKernelContext &ctx);

  template <typename T>
  uint32_t DoCompute(CpuKernelContext &ctx);

 private:
  DataType data_type;
  uint64_t unpack_axis;
  int64_t unpack_num;
  int64_t value_num;
  void *value_data_ptr;
  std::vector<void *> output_ptr_vec;
  std::vector<int64_t> value_shape_vec;
};
}  // namespace aicpu
#endif
@ -0,0 +1,120 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "unravel_index.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *KUnravelIndex = "UnravelIndex";
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const int64_t kParallelDataNumSameShape = 1000;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t UnravelIndexCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_INT32: {
|
||||
KERNEL_HANDLE_ERROR(DataAndTypeCheck<int32_t>(ctx), " data or type check failed.");
|
||||
UnravelCompute<int32_t>(ctx);
|
||||
break;
|
||||
}
|
||||
case DT_INT64: {
|
||||
KERNEL_HANDLE_ERROR(DataAndTypeCheck<int64_t>(ctx), " data or type check failed.");
|
||||
UnravelCompute<int64_t>(ctx);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
KERNEL_LOG_ERROR("UnravelIndex kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t UnravelIndexCpuKernel::DataAndTypeCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Unravel_Index check input and output number failed.");
|
||||
Tensor *indices = ctx.Input(0);
|
||||
Tensor *dims = ctx.Input(1);
|
||||
auto dims_number = ctx.Input(1)->NumElements();
|
||||
auto indices_number = ctx.Input(0)->NumElements();
|
||||
auto dims_data = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto indices_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto indices_type = indices->GetDataType();
|
||||
auto dims_type = dims->GetDataType();
|
||||
T dims_multi = 1;
|
||||
KERNEL_CHECK_FALSE((indices_type == dims_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(indices_type).c_str(), DTypeStr(dims_type).c_str())
|
||||
|
||||
for (auto i = 0; i < dims_number; i++) {
|
||||
KERNEL_CHECK_FALSE((*(dims_data + i) > 0), KERNEL_STATUS_PARAM_INVALID, "Each dimension size must be greater than 0.")
|
||||
dims_multi = dims_multi * (*(dims_data + i));
|
||||
}
|
||||
for (auto i = 0; i < indices_number; i++) {
|
||||
KERNEL_CHECK_FALSE((*(indices_data + i) >= 0), KERNEL_STATUS_PARAM_INVALID, "Indices must be non-negative.")
|
||||
KERNEL_CHECK_FALSE((*(indices_data + i) <= dims_multi), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Index is out of bound as with dims");
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t UnravelIndexCpuKernel ::UnravelCompute(CpuKernelContext &ctx) {
|
||||
auto indices_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto dims_data = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto dims_number = ctx.Input(1)->NumElements();
|
||||
auto indices_number = ctx.Input(0)->NumElements();
|
||||
auto data_num = indices_number;
|
||||
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
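// Convert each flat index into per-dimension coordinates by taking the remainder and quotient for every dimension, starting from the innermost one.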
auto sharder_unravel_index = [&](size_t start, size_t end) {
|
||||
for (auto j = start; j < end; j++) {
|
||||
T Quotient = *(indices_data + j);
|
||||
for (auto i = (dims_number - 1); i >= 0; i--) {
|
||||
*(output_data + i + j * dims_number) = Quotient % *(dims_data + i);
|
||||
Quotient = Quotient / *(dims_data + i);
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_unravel_index),
|
||||
"Unravel Index Compute failed.");
|
||||
} else {
|
||||
for (auto j = 0; j < indices_number; j++) {
|
||||
T Quotient = *(indices_data + j);
|
||||
for (auto i = (dims_number - 1); i >= 0; i--) {
|
||||
*(output_data + i + j * dims_number) = Quotient % *(dims_data + i);
|
||||
Quotient = Quotient / *(dims_data + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kUnravelIndex, UnravelIndexCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,37 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_UNRAVEL_INDEX_
|
||||
#define AICPU_KERNELS_NORMALIZED_UNRAVEL_INDEX_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class UnravelIndexCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~UnravelIndexCpuKernel() = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t UnravelCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,167 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "unsorted_segment_sum.h"
|
||||
|
||||
#include <string>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kUnsortedSegmentSum = "UnsortedSegmentSum";
|
||||
const uint32_t input_num = 3;
|
||||
const uint32_t output_num = 1;
|
||||
constexpr int64_t kParallelDataNums = 64 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename input_t, typename segment_ids_t, typename num_segments_t>
|
||||
uint32_t UnsortedSegmentSumCpuKernel::UnsortedSegmentSumComputeTemplate(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_num, output_num), " node input size should be [%llu], get [%llu]",
|
||||
input_num, ctx.GetInputsSize(), " node output size should be [%llu], get [%llu]", output_num,
|
||||
ctx.GetOutputsSize());
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu]",
|
||||
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
int64_t data_size = ctx.Input(0)->NumElements();
|
||||
int64_t id_size = ctx.Input(1)->NumElements();
|
||||
|
||||
auto input_x = reinterpret_cast<input_t *>(ctx.Input(0)->GetData());
|
||||
KERNEL_CHECK_NULLPTR(input_x, KERNEL_STATUS_PARAM_INVALID, "Get input data failed")
|
||||
auto output_y = reinterpret_cast<input_t *>(ctx.Output(0)->GetData());
|
||||
KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
|
||||
auto segmentids = reinterpret_cast<segment_ids_t *>(ctx.Input(1)->GetData());
|
||||
KERNEL_CHECK_NULLPTR(segmentids, KERNEL_STATUS_PARAM_INVALID, "Get segment_ids failed")
|
||||
auto numsegments = reinterpret_cast<num_segments_t *>(ctx.Input(2)->GetData());
|
||||
KERNEL_CHECK_NULLPTR(numsegments, KERNEL_STATUS_PARAM_INVALID, "Get num_segments failed")
|
||||
if (id_size <= 0) {
|
||||
KERNEL_LOG_ERROR("segment_ids num elements should great than 0");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
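// reshapesize is the number of elements in one inner slice of the input; each segment id addresses one such slice.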
int64_t reshapesize = data_size / id_size;
|
||||
// Initialized to 0
|
||||
memset(output_y, 0, ctx.Output(0)->GetDataSize());
|
||||
if (data_size <= kParallelDataNums) {
|
||||
// calculation process
|
||||
for (int64_t i = 0; i < id_size; i++) {
|
||||
if (*(segmentids + i) < *numsegments) {
|
||||
for (int64_t j = 0; j < reshapesize; j++) {
|
||||
*(output_y + *(segmentids + i) * reshapesize + j) += *(input_x + i * reshapesize + j);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > reshapesize) {
|
||||
max_core_num = reshapesize;
|
||||
}
|
||||
// calculation process
|
||||
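// Parallelize over the inner slice: every shard scans all segment ids but only accumulates its own column range [start, end), so shards never write the same output element.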
auto shard_unsorted_segment_sum = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = 0; i < id_size; i++) {
|
||||
if (*(segmentids + i) < *numsegments) {
|
||||
for (int64_t j = start; j < end; j++) {
|
||||
*(output_y + *(segmentids + i) * reshapesize + j) += *(input_x + i * reshapesize + j);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, reshapesize, reshapesize / max_core_num, shard_unsorted_segment_sum),
|
||||
"CpuKernelUtils::ParallelFor failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename input_t, typename segment_ids_t>
|
||||
uint32_t UnsortedSegmentSumCpuKernel::DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type) {
|
||||
switch (num_segments_type) {
|
||||
case DT_INT32:
|
||||
return UnsortedSegmentSumComputeTemplate<input_t, segment_ids_t, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return UnsortedSegmentSumComputeTemplate<input_t, segment_ids_t, int64_t>(ctx);
|
||||
|
||||
default:
|
||||
KERNEL_LOG_ERROR("UnsortedSegmentSum invalid num_segments_type type [%s]", DTypeStr(num_segments_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename input_t>
|
||||
uint32_t UnsortedSegmentSumCpuKernel::DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type) {
|
||||
auto num_segments_type = ctx.Input(2)->GetDataType();
|
||||
switch (segment_ids_type) {
|
||||
case DT_INT32:
|
||||
return DoComputeWithNumSegmentsType<input_t, int32_t>(ctx, num_segments_type);
|
||||
case DT_INT64:
|
||||
return DoComputeWithNumSegmentsType<input_t, int64_t>(ctx, num_segments_type);
|
||||
|
||||
default:
|
||||
KERNEL_LOG_ERROR("UnsortedSegmentSum invalid segment_ids_type type [%s]", DTypeStr(segment_ids_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t UnsortedSegmentSumCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
auto input_type = ctx.Input(0)->GetDataType();
|
||||
auto segment_ids_type = ctx.Input(1)->GetDataType();
|
||||
switch (input_type) {
|
||||
case DT_INT32:
|
||||
return DoComputeWithSegmentIdsType<int32_t>(ctx, segment_ids_type);
|
||||
case DT_INT16:
|
||||
return DoComputeWithSegmentIdsType<int16_t>(ctx, segment_ids_type);
|
||||
case DT_FLOAT:
|
||||
return DoComputeWithSegmentIdsType<float>(ctx, segment_ids_type);
|
||||
case DT_DOUBLE:
|
||||
return DoComputeWithSegmentIdsType<double>(ctx, segment_ids_type);
|
||||
case DT_FLOAT16:
|
||||
return DoComputeWithSegmentIdsType<Eigen::half>(ctx, segment_ids_type);
|
||||
case DT_INT8:
|
||||
return DoComputeWithSegmentIdsType<int8_t>(ctx, segment_ids_type);
|
||||
case DT_INT64:
|
||||
return DoComputeWithSegmentIdsType<int64_t>(ctx, segment_ids_type);
|
||||
case DT_UINT8:
|
||||
return DoComputeWithSegmentIdsType<uint8_t>(ctx, segment_ids_type);
|
||||
case DT_UINT16:
|
||||
return DoComputeWithSegmentIdsType<uint16_t>(ctx, segment_ids_type);
|
||||
case DT_UINT32:
|
||||
return DoComputeWithSegmentIdsType<uint32_t>(ctx, segment_ids_type);
|
||||
case DT_UINT64:
|
||||
return DoComputeWithSegmentIdsType<uint64_t>(ctx, segment_ids_type);
|
||||
case DT_COMPLEX64:
|
||||
return DoComputeWithSegmentIdsType<std::complex<float>>(ctx, segment_ids_type);
|
||||
case DT_COMPLEX128:
|
||||
return DoComputeWithSegmentIdsType<std::complex<double>>(ctx, segment_ids_type);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("UnsortedSegmentSum invalid input type [%s]", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kUnsortedSegmentSum, UnsortedSegmentSumCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_SUM_H
|
||||
#define AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_SUM_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class UnsortedSegmentSumCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~UnsortedSegmentSumCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename input_t, typename segment_ids_t, typename num_segments_t>
|
||||
uint32_t UnsortedSegmentSumComputeTemplate(CpuKernelContext &ctx);
|
||||
template <typename input_t, typename segment_ids_t>
|
||||
uint32_t DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type);
|
||||
template <typename input_t>
|
||||
uint32_t DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,153 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "upper_bound.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kUpperBound = "UpperBound";
|
||||
|
||||
#define UPPERBOUND_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = UpperBoundCompute<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("UpperBound kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define UPPERBOUND_COMPUTE_CASE_ALL(TYPE, CTX) \
|
||||
UPPERBOUND_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
|
||||
UPPERBOUND_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
|
||||
UPPERBOUND_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
|
||||
UPPERBOUND_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
|
||||
UPPERBOUND_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
|
||||
UPPERBOUND_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
|
||||
UPPERBOUND_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
|
||||
UPPERBOUND_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
|
||||
UPPERBOUND_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t UpperBoundCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "UpperBound check input and output number failed.");
|
||||
Tensor *sorted_x_data = ctx.Input(0);
|
||||
Tensor *values_data = ctx.Input(1);
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_type = output_data->GetDataType();
|
||||
auto sorted_x_type = sorted_x_data->GetDataType();
|
||||
auto values_type = values_data->GetDataType();
|
||||
if (sorted_x_type != values_type) {
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] must be same with Input[1] data type[%s]", DTypeStr(sorted_x_type).c_str(),
|
||||
DTypeStr(values_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
switch (output_type) {
|
||||
case DT_INT32:
|
||||
switch (sorted_x_type) {
|
||||
UPPERBOUND_COMPUTE_CASE_ALL(int32_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (sorted_x_type) {
|
||||
UPPERBOUND_COMPUTE_CASE_ALL(int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Output data type[%s] not supported.", DTypeStr(output_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t UpperBoundCpuKernel::UpperBoundCompute(CpuKernelContext &ctx) {
|
||||
Tensor *sorted_x_data = ctx.Input(0);
|
||||
auto sorted_x_data_addr = reinterpret_cast<T1 *>(sorted_x_data->GetData());
|
||||
auto sorted_x_data_shape = sorted_x_data->GetTensorShape();
|
||||
std::vector<int64_t> sorted_x_data_shape_dims = sorted_x_data_shape->GetDimSizes();
|
||||
Tensor *values_data = ctx.Input(1);
|
||||
auto values_data_addr = reinterpret_cast<T1 *>(values_data->GetData());
|
||||
auto values_data_shape = values_data->GetTensorShape();
|
||||
int64_t values_data_num = values_data_shape->NumElements();
|
||||
std::vector<int64_t> values_data_shape_dims = values_data_shape->GetDimSizes();
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_addr = reinterpret_cast<T2 *>(output_data->GetData());
|
||||
if (sorted_x_data_shape_dims[0] != values_data_shape_dims[0]) {
|
||||
KERNEL_LOG_ERROR("The number of rows of Input[0]:([%d]) should be consistent with that of Input[1]:([%d]).",
|
||||
sorted_x_data_shape_dims[0], values_data_shape_dims[0]);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
int64_t sorted_x_data_column = sorted_x_data_shape_dims[1];
|
||||
int64_t values_data_column = values_data_shape_dims[1];
|
||||
if (values_data_num < 1024) {
|
||||
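// For each value, binary-search its row of sorted_x for the first element strictly greater than the value; the result is that position's offset within the row.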
for (int64_t i = 0; i < values_data_num; i++) {
|
||||
int64_t seq_row = i / values_data_column;
|
||||
int64_t low = seq_row * sorted_x_data_column;
|
||||
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
|
||||
int64_t mid;
|
||||
while (low <= up) {
|
||||
mid = (low + up) / 2;
|
||||
if (values_data_addr[i] < sorted_x_data_addr[mid]) {
|
||||
up = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
output_data_addr[i] = low - seq_row * sorted_x_data_column;
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (sum_core_num > values_data_num) {
|
||||
sum_core_num = values_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t seq_row = i / values_data_column;
|
||||
int64_t low = seq_row * sorted_x_data_column;
|
||||
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
|
||||
int64_t mid;
|
||||
while (low <= up) {
|
||||
mid = (low + up) / 2;
|
||||
if (values_data_addr[i] < sorted_x_data_addr[mid]) {
|
||||
up = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
output_data_addr[i] = low - seq_row * sorted_x_data_column;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, values_data_num, values_data_num / sum_core_num, shard_compute),
|
||||
"UpperBound Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kUpperBound, UpperBoundCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_UPPERBOUND_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_UPPERBOUND_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class UpperBoundCpuKernel : public CpuKernel {
|
||||
public:
|
||||
UpperBoundCpuKernel() = default;
|
||||
~UpperBoundCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t UpperBoundCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,192 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "xdivy.h"
|
||||
|
||||
#include <complex>
|
||||
|
||||
#include "cmath"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kXdivy = "Xdivy";
|
||||
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
constexpr double EPSLON = 1e-15;
|
||||
|
||||
#define XDIVY_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = XdivyCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Xdivy kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t XdivyCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kXdivy);
|
||||
BCalcInfo calc_info;
|
||||
KERNEL_HANDLE_ERROR(XdivyParamCheck(ctx), "Xdivy check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
XDIVY_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
XDIVY_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
XDIVY_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
XDIVY_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
XDIVY_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Xdivy kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t XdivyCpuKernel::XdivyParamCheck(CpuKernelContext &ctx) {
|
||||
// the non null of input_0, input_1, output has been verified in NormalCheck
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
|
||||
KERNEL_LOG_DEBUG(
|
||||
"XdivyCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t XdivyCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
|
||||
auto input1 = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto input2 = static_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
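// SAME_SHAPE divides element-wise; X_ONE_ELEMENT / Y_ONE_ELEMENT broadcast the single-element input against every element of the other input.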
switch (type) {
|
||||
case BcastShapeType::SAME_SHAPE:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
*(output + i) = *(input1 + i) / *(input2 + i) + static_cast<T>(EPSLON);
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::X_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
*(output + i) = (*input1) / *(input2 + i) + static_cast<T>(EPSLON);
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::Y_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
*(output + i) = *(input1 + i) / (*input2) + static_cast<T>(EPSLON);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
|
||||
break;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t XdivyCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
BcastShapeType type = in0_elements_nums == in1_elements_nums
|
||||
? BcastShapeType::SAME_SHAPE
|
||||
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
auto sharder_div = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
|
||||
"Xdivy Compute failed.");
|
||||
} else {
|
||||
SpecialCompute<T>(type, 0, data_num, ctx);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t XdivyCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
|
||||
auto in0 = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = static_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
if (data_num >= kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
auto sharder_div = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
*(out + i) =
|
||||
*(in0 + bcast.GetBroadcastXIndex(i)) / *(in1 + bcast.GetBroadcastYIndex(i)) + static_cast<T>(EPSLON);
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
|
||||
"Xdivy Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) / *(in1 + bcast.GetBroadcastYIndex(i)) + static_cast<T>(EPSLON);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t XdivyCpuKernel::XdivyCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
|
||||
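// Broadcasting can be skipped when both shapes match or either input holds a single element.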
bool noNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
|
||||
if (noNeedBcast) {
|
||||
return NoBcastCompute<T>(ctx);
|
||||
} else {
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kXdivy, XdivyCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_XDIVY_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_XDIVY_H_
|
||||
#define EIGEN_USE_THREADS
|
||||
#define EIGEN_USE_SIMPLE_THREAD_POOL
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class XdivyCpuKernel : public CpuKernel {
|
||||
public:
|
||||
XdivyCpuKernel() = default;
|
||||
~XdivyCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t XdivyParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
uint32_t XdivyCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,216 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "xlogy.h"
|
||||
|
||||
#include "cmath"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kXlogy = "Xlogy";
|
||||
|
||||
const int64_t kParallelDataNum = 8 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
|
||||
#define XLOGY_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = XlogyCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Xlogy kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t XlogyCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kXlogy);
|
||||
BCalcInfo calc_info;
|
||||
KERNEL_HANDLE_ERROR(XlogyParamCheck(ctx), "Xlogy check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
XLOGY_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
XLOGY_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
XLOGY_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Xlogy kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t XlogyCpuKernel::XlogyParamCheck(CpuKernelContext &ctx) {
|
||||
// the non null of input_0, input_1, output has been verified in NormalCheck
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
|
||||
KERNEL_LOG_DEBUG(
|
||||
"XlogyCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t XlogyCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
|
||||
const T *input2, T *output) {
|
||||
auto zero = T(0);
|
||||
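// xlogy(x, y) is 0 when x == 0, NaN when y < 0, and x * log(y) otherwise; the three cases below only differ in how the inputs are broadcast.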
switch (type) {
|
||||
case BcastShapeType::SAME_SHAPE:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input1 + i) == zero) {
|
||||
*(output + i) = zero;
|
||||
continue;
|
||||
}
|
||||
if (*(input2 + i) < zero) {
|
||||
*(output + i) = std::numeric_limits<T>::quiet_NaN();
|
||||
continue;
|
||||
}
|
||||
*(output + i) = *(input1 + i) * log(*(input2 + i));
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::X_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input1) == zero) {
|
||||
*(output + i) = zero;
|
||||
continue;
|
||||
}
|
||||
if (*(input2 + i) < zero) {
|
||||
*(output + i) = std::numeric_limits<T>::quiet_NaN();
|
||||
continue;
|
||||
}
|
||||
*(output + i) = (*input1) * log(*(input2 + i));
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::Y_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input1 + i) == zero) {
|
||||
*(output + i) = zero;
|
||||
continue;
|
||||
}
|
||||
if (*(input2) < zero) {
|
||||
*(output + i) = std::numeric_limits<T>::quiet_NaN();
|
||||
continue;
|
||||
}
|
||||
*(output + i) = *(input1 + i) * log(*(input2));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
|
||||
break;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t XlogyCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
|
||||
auto in0 = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = static_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
BcastShapeType type = in0_elements_nums == in1_elements_nums
|
||||
? BcastShapeType::SAME_SHAPE
|
||||
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
auto sharder_div = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
|
||||
"Xlogy Compute failed.");
|
||||
} else {
|
||||
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t XlogyCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
|
||||
auto in0 = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = static_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
auto zero = T(0);
|
||||
if (data_num >= kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
auto sharder_div = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
*(out + i) = *(in1 + i) >= zero
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i)) * log(*(in1 + bcast.GetBroadcastYIndex(i)))
|
||||
: std::numeric_limits<T>::quiet_NaN();
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
|
||||
"Xlogy Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
*(out + i) = *(in1 + i) >= zero ? *(in0 + bcast.GetBroadcastXIndex(i)) * log(*(in1 + bcast.GetBroadcastYIndex(i)))
|
||||
: std::numeric_limits<T>::quiet_NaN();
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t XlogyCpuKernel::XlogyCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
|
||||
bool noNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
|
||||
if (noNeedBcast) {
|
||||
return NoBcastCompute<T>(ctx);
|
||||
} else {
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kXlogy, XlogyCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_XLOGY_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_XLOGY_H_
|
||||
#define EIGEN_USE_THREADS
|
||||
#define EIGEN_USE_SIMPLE_THREAD_POOL
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class XlogyCpuKernel : public CpuKernel {
|
||||
public:
|
||||
XlogyCpuKernel() = default;
|
||||
~XlogyCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t XlogyParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
uint32_t XlogyCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,61 @@
|
|||
|
||||
|
||||
#ifndef AICPU_UTILS_SPARSE_DENSE_CWISE_UTILS_H_
|
||||
#define AICPU_UTILS_SPARSE_DENSE_CWISE_UTILS_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
struct AddOp {
|
||||
static std::string Name() { return "Add"; }
|
||||
};
|
||||
|
||||
struct DivOp {
|
||||
static std::string Name() { return "Div"; }
|
||||
};
|
||||
|
||||
struct MulOp {
|
||||
static std::string Name() { return "Mul"; }
|
||||
};
|
||||
|
||||
template <typename Op>
|
||||
class SparseDenseCwiseOpKernel : public CpuKernel {
|
||||
public:
|
||||
SparseDenseCwiseOpKernel() = default;
|
||||
~SparseDenseCwiseOpKernel() override = default;
|
||||
|
||||
protected:
|
||||
virtual uint32_t Compute(CpuKernelContext &ctx) override = 0;
|
||||
|
||||
static uint32_t CheckParams(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseDenseCwiseOpSpecialCompute(BcastShapeType type, CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t SparseDenseCwiseOpSpecialComputeComplex(BcastShapeType type, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComputeOp(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComputeOpComplex(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseDenseCwiseOpNoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseDenseCwiseOpNoBcastComputeComplex(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseDenseCwiseOpBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseDenseCwiseOpBcastComputeComplex(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SparseDenseCwiseOpCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -67,6 +67,14 @@ class SparseTensor {
|
|||
* sparse eigen tensor indices valid
|
||||
* @return uint32_t: 0->success other->failed
|
||||
*/
|
||||
int dims() const { return dims_; }
|
||||
|
||||
std::shared_ptr<EigenTensor> indices() const { return ix_; }
|
||||
|
||||
std::shared_ptr<EigenTensor> values() const { return vals_; }
|
||||
|
||||
std::vector<int64_t> shape() const { return shape_; }
|
||||
|
||||
template <typename T>
|
||||
uint32_t EigenTensorIndicesValidCheck(int64_t dims_size) const {
|
||||
const auto ix_t = ix_->matrix<T>();
|
||||
|
|
|
@ -78,9 +78,75 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
|
|||
mindspore::kMaskedSelectOpName,
|
||||
mindspore::kMaskedSelectGradOpName,
|
||||
mindspore::kMedianOpName,
|
||||
mindspore::kACosGradOpName,
|
||||
mindspore::kAcoshGradOpName,
|
||||
mindspore::kAdaptiveAvgPool3DOpName,
|
||||
mindspore::kAdaptiveAvgPool3DGradOpName,
|
||||
mindspore::kAdaptiveMaxPool2DGradOpName,
|
||||
mindspore::kAdaptiveMaxPool3DOpName,
|
||||
mindspore::kAdaptiveMaxPool3DGradOpName,
|
||||
mindspore::kAddNOpName,
|
||||
mindspore::kAddV2OpName,
|
||||
mindspore::kAdjustContrastv2OpName,
|
||||
mindspore::kAdjustHueOpName,
|
||||
mindspore::kAdjustSaturationOpName,
|
||||
mindspore::kAffineGridGradOpName,
|
||||
mindspore::kAngleOpName,
|
||||
mindspore::kArgmaxOpName,
|
||||
mindspore::kArgMaxWithValueOpName,
|
||||
mindspore::kArgMinOpName,
|
||||
mindspore::kArgMinWithValueOpName,
|
||||
mindspore::KAsinGradOpName,
|
||||
mindspore::KAsinhGradOpName,
|
||||
mindspore::kAvgPoolOpName,
|
||||
mindspore::kAvgPoolGradOpName,
|
||||
mindspore::kBartlettWindowOpName,
|
||||
mindspore::kBatchNormGradGradOpName,
|
||||
mindspore::kBiasAddOpName,
|
||||
mindspore::kBiasAddGradOpName,
|
||||
mindspore::kBincountOpName,
|
||||
mindspore::kBlackmanWindowOpName,
|
||||
mindspore::kBroadcastOpName,
|
||||
mindspore::kMedianGradOpName,
|
||||
mindspore::kNMSWithMaskOpName,
|
||||
mindspore::kReduceSumOpName,
|
||||
mindspore::kSpaceToDepthOpName,
|
||||
mindspore::kSparseAddmmOpName,
|
||||
mindspore::kSparseApplyAdagradDAOpName,
|
||||
mindspore::kSparseApplyCenteredRMSPropOpName,
|
||||
mindspore::kSparseApplyMomentumOpName,
|
||||
mindspore::kSparseApplyProximalGradientDescentOpName,
|
||||
mindspore::kSparseConcatOpName,
|
||||
mindspore::kSparseDenseCwiseAddOpName,
|
||||
mindspore::kSparseDenseCwiseDivOpName,
|
||||
mindspore::kSparseDenseCwiseMulOpName,
|
||||
mindspore::kSparseMatrixMatMulOpName,
|
||||
mindspore::kSparseMatrixNNZOpName,
|
||||
mindspore::kSparseMatrixTransposeOpName,
|
||||
mindspore::kSparseFillEmptyRowsGradOpName,
|
||||
mindspore::kSparseReshapeOpName,
|
||||
mindspore::kSparseSegmentSqrtNGradOpName,
|
||||
mindspore::kSparseSegmentSqrtNWithNumSegmentsOpName,
|
||||
mindspore::kSparseSoftmaxCrossEntropyWithLogitsOpName,
|
||||
mindspore::kSparseSparseMaximumOpName,
|
||||
mindspore::kSparseSparseMinimumOpName,
|
||||
mindspore::kSparseSegmentSumWithNumSegmentsOpName,
|
||||
mindspore::kSplitOpName,
|
||||
mindspore::kSqrtOpName,
|
||||
mindspore::kSqrtGradOpName,
|
||||
mindspore::kTanhOpName,
|
||||
mindspore::kTileOpName,
|
||||
mindspore::kTridiagonalMatMulOpName,
|
||||
mindspore::kTripletMarginLossOpName,
|
||||
mindspore::kTransposeOpName,
|
||||
mindspore::kTriuIndicesOpName,
|
||||
mindspore::kTrilIndicesOpName,
|
||||
mindspore::kUnpackOpName,
|
||||
mindspore::kUnravelIndexOpName,
|
||||
mindspore::kUnsortedSegmentSumOpName,
|
||||
mindspore::kUpperBoundOpName,
|
||||
mindspore::kXlogyOpName,
|
||||
mindspore::kXdivyOpName,
|
||||
mindspore::kFFTWithSizeOpName,
|
||||
mindspore::kHistogramDOpName,
|
||||
mindspore::kIm2colOpName,
|
||||
|
|
|
@ -410,7 +410,7 @@ class _MindsporeFunctionExecutor:
|
|||
# Case: If the shape of input args is dynamic, get dynamic shape tensor from context and use it to compile.
|
||||
compile_args = args_list
|
||||
# Case: The `set_inputs()` of Cell object has been set, using these dynamic shape args as compile args.
|
||||
if isinstance(self.obj, ms.nn.Cell) and self.obj.get_inputs():
|
||||
if self.fn.__name__ == 'construct' and isinstance(self.obj, ms.nn.Cell) and self.obj.get_inputs():
|
||||
compile_args = self.obj.get_inputs()
|
||||
for args in compile_args:
|
||||
Validator.check_isinstance("args set in `set_inputs()` of Cell", args, PythonTensor)
|
||||
|
|
|
@ -13,9 +13,48 @@
|
|||
# limitations under the License.
|
||||
|
||||
"""aicpu ops"""
|
||||
from .adaptive_max_pool_3d_grad import _adaptive_max_pool_3d_grad_aicpu
|
||||
from .adaptive_max_pool_3d import _adaptive_max_pool_3d_aicpu
|
||||
from .adaptive_max_pool_2d_grad import _adaptive_max_pool_2d_grad_aicpu
|
||||
from .adaptive_avg_pool_3d_grad import _adaptiveavgpool3d_grad_aicpu
|
||||
from .adaptive_avg_pool_3d import _adaptiveavgpool3d_aicpu
|
||||
from .tile import _tile_aicpu
|
||||
from .tanh import _tanh_aicpu
|
||||
from .space_to_depth import _space_to_depth_aicpu
|
||||
from .sparse_matrix_transpose import _sparse_matrix_transpose_aicpu
|
||||
from .sparse_matrix_nnz import _sparse_matrix_nnz_aicpu
|
||||
from .sparse_matrix_mat_mul import _sparse_matrix_mat_mul_aicpu
|
||||
from .sparse_dense_cwise_mul import _sparse_dense_cwise_mul_aicpu
|
||||
from .sparse_dense_cwise_div import _sparse_dense_cwise_div_aicpu
|
||||
from .sparse_dense_cwise_add import _sparse_dense_cwise_add_aicpu
|
||||
from .sparse_concat import _sparse_concat_aicpu
|
||||
from .sparse_apply_proximal_gradient_descent import _sparse_apply_proximal_gradient_descent_aicpu
|
||||
from .sparse_apply_momentum import _sparse_apply_momentum_aicpu
|
||||
from .sparse_apply_centered_rms_prop import _sparse_apply_centered_rms_prop_aicpu
|
||||
from .sparse_apply_adagrad_da import _sparse_apply_adagrad_da_aicpu
|
||||
from .sparseaddmm import _sparse_addmm_aicpu
|
||||
from .broadcast_to import _broadcast_to_aicpu
|
||||
from .blackman_window import _blackman_window_aicpu
|
||||
from .bincount import _bincount_aicpu
|
||||
from .asinh_grad import _asinh_grad_aicpu
|
||||
from .unique import _unique_aicpu
|
||||
from .add_n import _add_n_aicpu
|
||||
from .add_v2 import _add_v2_aicpu
|
||||
from .adjust_contrastv2 import _adjust_contrastv2_aicpu
|
||||
from .adjust_hue import _adjust_hue_aicpu
|
||||
from .adjust_saturation import _adjust_saturation_aicpu
|
||||
from .affine_grid_grad import _affine_grid_grad_aicpu
|
||||
from .angle import _angle_aicpu
|
||||
from .arg_max import _arg_max_aicpu
|
||||
from .argmax_with_value import _argmax_with_value_aicpu
|
||||
from .arg_min import _arg_min_aicpu
|
||||
from .argmin_with_value import _argmin_with_value_aicpu
|
||||
from .avgpool_v1 import _avgpool_v1_aicpu
|
||||
from .avgpool_grad_v1 import _avgpool_grad_v1_aicpu
|
||||
from .matrix_solve import _matrix_solve_aicpu
|
||||
from .betainc import _betainc_aicpu
|
||||
from .bartlett_window import _bartlett_window_aicpu
|
||||
from .batch_norm_grad_grad import _batch_norm_grad_grad_aicpu
|
||||
from .no_repeat_ngram import _no_repeat_ngram_aicpu
|
||||
from .init_data_set_queue import _init_data_set_queue_aicpu
|
||||
from .embedding_lookup import _embedding_lookup_aicpu
|
||||
|
@ -43,6 +82,7 @@ from .topk import _top_k_aicpu
|
|||
from .tensor_scatter_update import _tensor_scatter_update_aicpu
|
||||
from .log1p import _log1p_aicpu
|
||||
from .asin import _asin_aicpu
|
||||
from .asin_grad import _asin_grad_aicpu
|
||||
from .is_finite import _is_finite_aicpu
|
||||
from .is_inf import _is_inf_aicpu
|
||||
from .is_nan import _is_nan_aicpu
|
||||
|
@ -52,14 +92,18 @@ from .cosh import _cosh_aicpu
|
|||
from .sign import _sign_aicpu
|
||||
from .squeeze import _squeeze_aicpu
|
||||
from .acos import _acos_aicpu
|
||||
from .acos_grad import _acos_grad_aicpu
|
||||
from .expand import _expand_aicpu
|
||||
from .expand_dims import _expand_dims_aicpu
|
||||
from .randperm import _randperm_aicpu
|
||||
from .random_choice_with_mask import _random_choice_with_mask_aicpu
|
||||
from .rsqrt import _rsqrt_aicpu
|
||||
from .sqrt import _sqrt_aicpu
|
||||
from .sqrt_grad import _sqrt_grad_aicpu
|
||||
from .search_sorted import _search_sorted_aicpu
|
||||
from .stack import _stack_aicpu
|
||||
from .unstack import _unstack_aicpu
|
||||
from .unsorted_segment_sum import _unsorted_segment_sum_aicpu
|
||||
from .addcmul import _addcmul_aicpu
|
||||
from .uniform_candidate_sampler import _uniform_candidate_sampler_aicpu
|
||||
from .log_uniform_candidate_sampler import _log_uniform_candidate_sampler_aicpu
|
||||
|
@ -69,6 +113,7 @@ from .reverse_sequence import _reverse_sequence_aicpu
|
|||
from .log_matrix_determinant import _log_matrix_determinant_aicpu
|
||||
from .crop_and_resize import _crop_and_resize_aicpu
|
||||
from .acosh import _acosh_aicpu
|
||||
from .acosh_grad import _acosh_grad_aicpu
|
||||
from .rnnt_loss import _rnnt_loss_aicpu
|
||||
from .random_categorical import _random_categorical_aicpu
|
||||
from .tanh_grad import _tanh_grad_aicpu
|
||||
|
@ -86,6 +131,7 @@ from .sub import _sub_aicpu
|
|||
from .not_equal import _not_equal_aicpu
|
||||
from .poisson import _poisson_aicpu
|
||||
from .update_cache import _update_cache_aicpu
|
||||
from .upper_bound import _upper_bound_aicpu
|
||||
from .cache_swap_table import _cache_swap_table_aicpu
|
||||
from .uniform_int import _uniform_int_aicpu
|
||||
from .uniform_real import _uniform_real_aicpu
|
||||
|
@ -97,6 +143,23 @@ from .end_of_sequence import _end_of_sequence_aicpu
|
|||
from .fused_sparse_adam import _fused_sparse_adam_aicpu
|
||||
from .fused_sparse_lazy_adam import _fused_sparse_lazy_adam_aicpu
|
||||
from .fused_sparse_ftrl import _fused_sparse_ftrl_aicpu
|
||||
from .sparse_fill_empty_rows_grad import _sparse_fill_empty_rows_grad_aicpu
|
||||
from .sparse_reshape import _sparse_reshape_aicpu
|
||||
from .sparse_segment_sqrt_n_grad import _sparse_segment_sqrt_n_grad_aicpu
|
||||
from .sparse_segment_mean_with_num_segments import _sparse_segment_mean_with_num_segments_aicpu
|
||||
from .sparse_segment_sum_with_num_segments import _sparse_segment_sum_with_num_segments_aicpu
|
||||
from .sparse_softmax_cross_entropy_with_logits_v2 import _sparse_softmax_cross_entropy_with_logits_v2_aicpu
|
||||
from .sparsesparsemaximum import _sparsesparsemaximum_aicpu
|
||||
from .sparse_sparse_minimum import _sparse_sparse_minimum_aicpu
|
||||
from .split import _split_aicpu
|
||||
from .transpose import _transpose_aicpu
|
||||
from .tridiagonal_matmul import _tridiagonal_matmul_aicpu
|
||||
from .tril_indices import _tril_indices_aicpu
|
||||
from .triu_indices import _triu_indices_aicpu
|
||||
from .triplet_margin_loss import _triplet_margin_loss_aicpu
|
||||
from .unravel_index import _unravel_index_aicpu
|
||||
from .xlogy import _xlogy_aicpu
|
||||
from .xdivy import _xdivy_aicpu
|
||||
from .fused_sparse_proximal_adagrad import _fused_sparse_proximal_adagrad_aicpu
|
||||
from .meshgrid import _meshgrid_aicpu
|
||||
from .div import _div_aicpu
|
||||
|
|