merge canndev code to mindspore

This commit is contained in:
shen_jingxing 2023-02-07 14:35:40 +08:00
parent aacab0ca60
commit 0dd977ccef
50 changed files with 6861 additions and 0 deletions

View File

@ -97,6 +97,9 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "knownConditionTrueFalse"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "passedByValue"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "syntaxError"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unusedVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowArgument"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"

View File

@ -134,5 +134,6 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/operators"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/comma"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "runtime/indentation_namespace"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/blank_line"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/line_length"

View File

@ -344,6 +344,15 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_avg_pool.cc:aicpu::FractionalAvgPoolCpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/densetosparsesetoperation.cc:aicpu::DenseToSparseSetOperationCpuKernel::ComputeDenseToSparse
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpSpecialCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/resize_area.cc:aicpu::ResizeAreaCpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/trace_grad.cc:aicpu::TraceGradCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sspaddmm.cc:aicpu::SspaddmmCpuKernel::ValidParam
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_sum.cc:aicpu::SegmentSumCpuKernel::SegmentSumCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_min.cc:aicpu::SegmentMinCpuKernel::SegmentMinCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_tensor_dense_mat_mul.cc:aicpu::SparseTensorDenseMatMulCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_tensor_dense_mat_mul.cc:aicpu::SparseTensorDenseMatMulCpuKernel::regular_calculate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sspaddmm.cc:aicpu::SspaddmmCpuKernel::ScalarSparseMul
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_tensor_dense_add.cc:aicpu::SparseTensorDenseAddCpuKernel::ValidateInputs
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpBcastCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpSpecialComputeComplex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpBcastComputeComplex

View File

@ -0,0 +1,209 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "next_after.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kNextAfter = "NextAfter";
// when the input data size exceeds kParallelDataNum, use the parallel compute path
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define NEXTAFTER_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = NextAfterCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("NextAfter kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t NextAfterCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "NextAfter check input and output number failed.");
KERNEL_HANDLE_ERROR(NextAfterParamCheck(ctx), "NextAfter check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
NEXTAFTER_COMPUTE_CASE(DT_FLOAT, float, ctx)
NEXTAFTER_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("NextAfter kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t NextAfterCpuKernel::NextAfterParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"NextAfterCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
/*
SpecialCompute is used in the following situations:
1. input1 and input2 have the same shape
2. input1 is a 1-D tensor with a single element, or a scalar
3. input2 is a 1-D tensor with a single element, or a scalar
Any other shape combination is handled by the broadcast compute path.
*/
template <typename T>
void NextAfterCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
const T *input2, T *output) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(output + i) = nextafter(*(input1 + i), *(input2 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = nextafter(*input1, *(input2 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = nextafter(*(input1 + i), *input2);
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
template <typename T>
uint32_t NextAfterCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_nextafter = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_nextafter),
"NextAfter Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t NextAfterCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_nextafter = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) = nextafter(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_nextafter),
"NextAfter Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = nextafter(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t NextAfterCpuKernel::NextAfterCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (isNeedBcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kNextAfter, NextAfterCpuKernel);
} // namespace aicpu
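
As a point of reference for the BcastShapeType dispatch in SpecialCompute above, the following is a minimal standalone sketch built only on std::nextafter from <cmath>; ShapeCase and NextAfterDispatch are illustrative names rather than part of the kernel API, and the real kernel additionally shards the loop with ParallelFor.

// Standalone illustration of the element-wise NextAfter dispatch above.
// ShapeCase and NextAfterDispatch are hypothetical; only std::nextafter is real.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

enum class ShapeCase { kSameShape, kXOneElement, kYOneElement };

std::vector<double> NextAfterDispatch(const std::vector<double> &x, const std::vector<double> &y) {
  ShapeCase c = x.size() == y.size() ? ShapeCase::kSameShape
                                     : (x.size() == 1 ? ShapeCase::kXOneElement : ShapeCase::kYOneElement);
  size_t n = std::max(x.size(), y.size());
  std::vector<double> out(n);
  for (size_t i = 0; i < n; ++i) {
    double a = (c == ShapeCase::kXOneElement) ? x[0] : x[i];
    double b = (c == ShapeCase::kYOneElement) ? y[0] : y[i];
    out[i] = std::nextafter(a, b);  // next representable double after a, in the direction of b
  }
  return out;
}

int main() {
  // Neighbors of 1.0, 3.0 and -0.0 in the direction of 2.0 (the Y_ONE_ELEMENT case).
  for (double v : NextAfterDispatch({1.0, 3.0, -0.0}, {2.0})) std::printf("%.17g\n", v);
  return 0;
}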

View File

@ -0,0 +1,48 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_NEXTAFTER_H_
#define AICPU_KERNELS_NORMALIZED_NEXTAFTER_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class NextAfterCpuKernel : public CpuKernel {
public:
NextAfterCpuKernel() = default;
~NextAfterCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t NextAfterParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t NextAfterCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,127 @@
/*
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "non_deterministic_ints.h"
#include <cmath>
#include <ctime>
#include <iostream>
#include <random>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const char *kNonDeterministicInts = "NonDeterministicInts";
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 1;
const uint32_t kInputDims = 1;
const uint32_t kInputSizes = 2;
constexpr int64_t kParallelDataNums = 7 * 1024;
} // namespace
namespace aicpu {
template <typename T1, typename T2>
uint32_t NonDeterministicIntsCpuKernel::DoCompute(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *output = ctx.Output(0);
auto input_nums = input->NumElements();
auto input_data = reinterpret_cast<T2 *>(input->GetData());
auto output_data = reinterpret_cast<T1 *>(output->GetData());
auto output_nums = ctx.Output(0)->NumElements();
auto max_data = std::numeric_limits<T1>::max();
std::vector<int64_t> out_put_dims;
for (auto i = 0; i < input_nums; i++) {
if (*(input_data + i) <= 0) {
KERNEL_LOG_ERROR("Shape elements must be > 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
out_put_dims.push_back(input_data[i]);
}
if (output_nums <= kParallelDataNums) {
std::default_random_engine seed(time(0));
std::uniform_int_distribution<T1> u(-max_data, max_data);
for (auto j = 0; j < output_nums; j++) {
*(output_data + j) = u(seed);
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > output_nums) {
max_core_num = output_nums;
}
auto shard_non_deterministic_ints = [&](int64_t start, int64_t end) {
std::default_random_engine seed(time(0));
std::uniform_int_distribution<T1> u(-max_data, max_data);
for (auto j = start; j < end; j++) {
*(output_data + j) = u(seed);
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, output_nums, output_nums / max_core_num, shard_non_deterministic_ints),
"NonDeterministicInts compute failed.");
}
output->GetTensorShape()->SetDimSizes(out_put_dims);
return KERNEL_STATUS_OK;
}
uint32_t NonDeterministicIntsCpuKernel::DataAndTypeCheck(CpuKernelContext &ctx) {
// the non null of input and output has been verified in NormalCheck
Tensor *input = ctx.Input(0);
auto input_data_nums = input->NumElements();
auto data_type = input->GetDataType();
KERNEL_CHECK_FALSE((data_type == DT_INT32 || data_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
" Input type must be one of int32 or int64.");
KERNEL_CHECK_FALSE((input_data_nums >= kInputSizes), KERNEL_STATUS_PARAM_INVALID, "Input data elements must >= 2.");
KERNEL_CHECK_FALSE((input->GetTensorShape()->GetDimSizes().size() == kInputDims), KERNEL_STATUS_PARAM_INVALID,
"Input tensor must be a 1-D tensor.");
return KERNEL_STATUS_OK;
}
uint32_t NonDeterministicIntsCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check params failed.", kNonDeterministicInts);
KERNEL_HANDLE_ERROR(DataAndTypeCheck(ctx), "Data or type check failed.");
auto output_data_type = ctx.Output(0)->GetDataType();
auto input_data_type = ctx.Input(0)->GetDataType();
uint32_t ret = KERNEL_STATUS_OK;
switch (output_data_type) {
case DT_INT32: {
if (input_data_type == DT_INT32) {
ret = DoCompute<int32_t, int32_t>(ctx);
} else {
ret = DoCompute<int32_t, int64_t>(ctx);
}
break;
}
case DT_INT64: {
if (input_data_type == DT_INT32) {
ret = DoCompute<int64_t, int32_t>(ctx);
} else {
ret = DoCompute<int64_t, int64_t>(ctx);
}
break;
}
default: {
KERNEL_LOG_ERROR("NonDeterministicInts kernel data type [%s] not support.", DTypeStr(output_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), ret, "Compute failed.");
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kNonDeterministicInts, NonDeterministicIntsCpuKernel);
} // namespace aicpu
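
For intuition, the kernel above reads a 1-D shape tensor, validates it, and fills the output with uniformly distributed integers. The following is a hedged standalone sketch of that flow: NonDeterministicIntsSketch is an illustrative name, the AICPU Tensor plumbing and the ParallelFor sharding are omitted, and the sketch seeds from std::random_device rather than the time-based seed used above.

// Derive an element count from a shape vector and fill a buffer with uniform
// random int32 values, mirroring DoCompute above in plain standard C++.
#include <cstdint>
#include <limits>
#include <random>
#include <stdexcept>
#include <vector>

std::vector<int32_t> NonDeterministicIntsSketch(const std::vector<int64_t> &shape) {
  int64_t num = 1;
  for (int64_t d : shape) {
    if (d <= 0) throw std::invalid_argument("Shape elements must be > 0.");  // same check as DoCompute
    num *= d;
  }
  std::random_device rd;  // non-deterministic seed source
  std::mt19937_64 engine(rd());
  const int32_t bound = std::numeric_limits<int32_t>::max();
  std::uniform_int_distribution<int32_t> dist(-bound, bound);
  std::vector<int32_t> out(static_cast<size_t>(num));
  for (auto &v : out) v = dist(engine);
  return out;
}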

View File

@ -0,0 +1,34 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_NONDETERMINISTICINTS_H_
#define AICPU_KERNELS_NORMALIZED_NONDETERMINISTICINTS_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class NonDeterministicIntsCpuKernel : public CpuKernel {
public:
NonDeterministicIntsCpuKernel() = default;
~NonDeterministicIntsCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
template <typename T1, typename T2>
static uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,208 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pow.h"
#include <math.h>
#include <stdint.h>
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
#include "securec.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kPow = "Pow";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define POW_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = PowCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Pow kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t PowCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Pow check input and output number failed.");
KERNEL_HANDLE_ERROR(PowParamCheck(ctx), "Pow check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
POW_COMPUTE_CASE(DT_INT8, int8_t, ctx)
POW_COMPUTE_CASE(DT_INT32, int32_t, ctx)
POW_COMPUTE_CASE(DT_INT64, int64_t, ctx)
POW_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
POW_COMPUTE_CASE(DT_FLOAT, float, ctx)
POW_COMPUTE_CASE(DT_DOUBLE, double, ctx)
POW_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
POW_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Pow kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t PowCpuKernel::PowParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
auto input0_Shape = input_0->GetTensorShape();
auto input1_Shape = input_1->GetTensorShape();
KERNEL_CHECK_NULLPTR(input0_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input0_Shape failed.")
KERNEL_CHECK_NULLPTR(input1_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input1_Shape failed.")
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"PowCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
void PowCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, T *input1, T *input2, T *output) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(output + i) = pow(*(input1 + i), *(input2 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = pow(*(input1), *(input2 + i));
}
break;
default:
for (int64_t i = start; i < end; ++i) {
*(output + i) = pow(*(input1 + i), *(input2));
}
break;
}
}
template <typename T>
uint32_t PowCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_pow = [&](size_t start, size_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_pow),
"Pow Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PowCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_pow = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
auto input1 = in0 + bcast.GetBroadcastXIndex(i); // i-th value of input0
auto input2 = in1 + bcast.GetBroadcastYIndex(i); // i-th value of input1
*(out + i) = pow((*input1), (*input2));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_pow),
"Pow Compute failed.");
} else {
for (int64_t i = 0; i < data_num; i++) {
auto input1 = in0 + bcast.GetBroadcastXIndex(i); // i-th value of input0
auto input2 = in1 + bcast.GetBroadcastYIndex(i); // i-th value of input1
*(out + i) = pow((*input1), (*input2));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PowCpuKernel::PowCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (isNeedBcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kPow, PowCpuKernel);
} // namespace aicpu
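
The BcastCompute path above leans on Bcast::GetBroadcastXIndex / GetBroadcastYIndex to map a flat output index back into each (possibly broadcast) input. A minimal way to picture that mapping, independent of the Bcast utility, uses a hypothetical BroadcastIndex helper and assumes the shapes are already right-aligned and padded to the same rank.

// Hypothetical illustration of flat-index broadcasting as used by BcastCompute.
// The input index is obtained by clamping the coordinate to 0 along every
// dimension where that input has size 1.
#include <cstdint>
#include <vector>

int64_t BroadcastIndex(int64_t flat_out, const std::vector<int64_t> &out_shape,
                       const std::vector<int64_t> &in_shape) {
  // Shapes are assumed padded to the same rank.
  int64_t in_index = 0;
  int64_t in_stride = 1;
  for (int i = static_cast<int>(out_shape.size()) - 1; i >= 0; --i) {
    int64_t coord = flat_out % out_shape[i];
    flat_out /= out_shape[i];
    int64_t in_coord = (in_shape[i] == 1) ? 0 : coord;  // broadcast dimension collapses to 0
    in_index += in_coord * in_stride;
    in_stride *= in_shape[i];
  }
  return in_index;
}
// Example: out_shape {2, 3}, in_shape {1, 3} -> flat output index 4 (row 1, col 1) maps to input index 1.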

View File

@ -0,0 +1,45 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_POW_H_
#define AICPU_KERNELS_NORMALIZED_POW_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class PowCpuKernel : public CpuKernel {
public:
PowCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t PowParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, T *input1, T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t PowCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,100 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "real.h"
#include "Eigen/Eigen"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kReal = "Real";
constexpr int64_t kFloatDataNums = 8 * 128 * 1024;
constexpr int64_t kDoubleDataNums = 16 * 128 * 1024;
#define Real_COMPUTE_CASE(IN_DTYPE, IN_TYPE, OUT_DTYPE, CTX) \
case (IN_DTYPE): { \
switch (OUT_DTYPE) { \
case (DT_FLOAT): { \
uint32_t result = RealCompute<IN_TYPE, float>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Real kernel compute failed."); \
return result; \
} \
break; \
} \
case (DT_DOUBLE): { \
uint32_t result = RealCompute<IN_TYPE, double>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Real kernel compute failed."); \
return result; \
} \
break; \
} \
default: \
KERNEL_LOG_ERROR("Real kernel output data type [%s] not support.", DTypeStr(OUT_DTYPE).c_str()); \
return KERNEL_STATUS_PARAM_INVALID; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t RealCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kReal);
DataType input_type = ctx.Input(0)->GetDataType();
switch (input_type) {
Real_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, DT_FLOAT, ctx)
Real_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, DT_DOUBLE, ctx)
default:
KERNEL_LOG_ERROR("Real kernel input data type [%s] not supported.", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename t>
uint32_t RealCpuKernel::RealCompute(CpuKernelContext &ctx) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<t *>(ctx.Output(0)->GetData());
auto data_type = ctx.Input(0)->GetDataType();
int64_t data_num = ctx.Output(0)->NumElements();
int64_t data_size = data_num * sizeof(T);
if ((data_type == DT_COMPLEX64 && data_size <= kFloatDataNums) ||
(data_type == DT_COMPLEX128 && data_size <= kDoubleDataNums)) {
for (int64_t index = 0; index < data_num; ++index) {
*(output + index) = (*(input + index)).real();
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_real = [&](size_t start, size_t end) {
for (size_t index = start; index < end; ++index) {
*(output + index) = (*(input + index)).real();
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_real),
"real Compute failed");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kReal, RealCpuKernel);
} // namespace aicpu
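
The per-element work in RealCompute above reduces to std::complex::real(); a trivial hedged sketch without the AICPU plumbing (RealParts is an illustrative name):

// Minimal illustration of the element-wise operation in RealCompute.
#include <complex>
#include <vector>

std::vector<float> RealParts(const std::vector<std::complex<float>> &in) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) out[i] = in[i].real();  // drop the imaginary part
  return out;
}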

View File

@ -0,0 +1,40 @@
/**
* Copyright (C) 2020-2021. Huawei Technologies Co., Ltd. All rights reserved.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the Apache License Version 2.0.You may not use this file except in compliance with the License.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Apache License for more details at
* http://www.apache.org/licenses/LICENSE-2.0
*
* @brief
*
* @version 1.0
*
*/
#ifndef AICPU_KERNELS_NORMALIZED_REAL_H_
#define AICPU_KERNELS_NORMALIZED_REAL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class RealCpuKernel : public CpuKernel {
public:
RealCpuKernel() = default;
~RealCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t RealCheck(CpuKernelContext &ctx);
template <typename T, typename t>
uint32_t RealCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,312 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "resize_area.h"
#include <securec.h>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/sparse_tensor.h"
namespace {
constexpr uint32_t kInputNum = 2;
constexpr uint32_t kOutputNum = 1;
const int64_t kParallelDataNum = 1024 * 1024;
const char *kResizeArea = "ResizeArea";
#define RESIZEAREA_COMPUTE_CASE(DTYPE, CHANNEL, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(st, x_interps, CHANNEL, CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("ResizeArea kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
inline int64_t Bound(int64_t val, int64_t limit) { return std::min(limit - 1, std::max(int64_t{0}, val)); }
float Scaling_(size_t in_size, size_t out_size, bool align_corners) {
return (align_corners && out_size > 1) ? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
uint32_t ResizeAreaCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
uint32_t res = GetInputAndCheck(ctx);
KERNEL_CHECK_FALSE(res == KERNEL_STATUS_OK, res, "GetInputAndCheck failed.");
ResizeAreaSt st;
st.CalSt(ctx);
// compute the weight of pixels in rows
std::vector<ResizeAreaCachedInterpolation> x_interps(st.out_width);
for (size_t x = 0; x < st.out_width; x++) {
auto &x_interp = x_interps[x];
const float transit_x0 = x * st.width_scale;
const float transit_x1 = (x + 1) * st.width_scale;
size_t v = std::floor(transit_x0);
x_interp.start = v;
x_interp.start_scale = (v + 1 > transit_x1 ? st.width_scale : v + 1 - transit_x0);
v = std::ceil(transit_x1);
x_interp.end = v;
v = x_interp.end - 1;
x_interp.end_minus_one_scale = (v + 1 > transit_x1 ? transit_x1 - v : 1.0);
}
auto channels_num = -1;
if (st.channels == 3) {
channels_num = 3;
}
switch (dtype_) {
RESIZEAREA_COMPUTE_CASE(DT_INT8, channels_num, int8_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_INT16, channels_num, int16_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_INT32, channels_num, int32_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_INT64, channels_num, int64_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_UINT8, channels_num, uint8_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_UINT16, channels_num, uint16_t, ctx)
RESIZEAREA_COMPUTE_CASE(DT_FLOAT, channels_num, float, ctx)
RESIZEAREA_COMPUTE_CASE(DT_FLOAT16, channels_num, Eigen::half, ctx)
RESIZEAREA_COMPUTE_CASE(DT_DOUBLE, channels_num, double, ctx)
default:
KERNEL_LOG_ERROR("ResizeArea doesn't support input tensor types: [%s]", DTypeStr(dtype_).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t ResizeAreaCpuKernel::DoCompute(const ResizeAreaSt &st, std::vector<ResizeAreaCachedInterpolation> &x_interps,
int64_t kKnownNumChannels, CpuKernelContext &ctx) {
auto input_ptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_ptr = reinterpret_cast<float *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Input(0)->NumElements();
float scale = 1.0 / (st.height_scale * st.width_scale);
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > st.out_height) {
max_core_num = st.out_height;
}
for (size_t b = 0; b < st.batch_size; ++b) {
auto shared_resize_area = [&](size_t start, size_t end) {
// compute the weight of pixels in columns
for (size_t y = start; y < end; ++y) {
const float transit_y0 = y * st.height_scale;
const float transit_y1 = (y + 1) * st.height_scale;
// The start and end height indices of all the cells that could
// contribute to the target cell.
const int64_t y_start = std::floor(transit_y0);
const int64_t y_end = std::ceil(transit_y1);
std::vector<float> y_scales;
std::vector<const T *> y_ptrs;
y_scales.clear();
y_ptrs.clear();
for (int64_t i = y_start; i < y_end; ++i) {
float scale_y;
if (i < transit_y0) {
scale_y = (i + 1 > transit_y1 ? st.height_scale : i + 1 - transit_y0);
} else {
scale_y = (i + 1 > transit_y1 ? transit_y1 - i : 1.0);
}
y_scales.push_back(scale_y);
y_ptrs.push_back(input_ptr + (b * st.in_height * st.in_width * st.channels +
Bound(i, st.in_height) * st.in_width * st.channels));
}
float *output_patch_ptr =
output_ptr + (b * st.out_height * st.out_width * st.channels + y * st.out_width * st.channels);
if (kKnownNumChannels == 3) {
for (size_t x = 0; x < st.out_width; ++x) {
const ResizeAreaCachedInterpolation &x_interp = x_interps[x];
if (x_interp.needs_bounding) {
ComputePatchSumOf3Channels<true>(scale, st, y_ptrs, y_scales, x_interp, output_patch_ptr);
} else {
ComputePatchSumOf3Channels<false>(scale, st, y_ptrs, y_scales, x_interp, output_patch_ptr);
}
output_patch_ptr += 3;
}
} else {
for (size_t x = 0; x < st.out_width; ++x) {
const ResizeAreaCachedInterpolation &x_interp = x_interps[x];
if (x_interp.needs_bounding) {
ComputePatchSum<true>(scale, st, y_ptrs, y_scales, x_interp, output_patch_ptr);
} else {
ComputePatchSum<false>(scale, st, y_ptrs, y_scales, x_interp, output_patch_ptr);
}
output_patch_ptr += st.channels;
}
}
}
};
CpuKernelUtils::ParallelFor(ctx, st.out_height, st.out_height / max_core_num, shared_resize_area);
}
} else {
std::vector<float> y_scales;
std::vector<const T *> y_ptrs;
for (size_t b = 0; b < st.batch_size; ++b) {
for (size_t y = 0; y < st.out_height; ++y) {
y_scales.clear();
y_ptrs.clear();
const float transit_y0 = y * st.height_scale;
const float transit_y1 = (y + 1) * st.height_scale;
// The start and end height indices of all the cells that could
// contribute to the target cell.
const size_t y_start = std::floor(transit_y0);
const size_t y_end = std::ceil(transit_y1);
for (size_t i = y_start; i < y_end; ++i) {
float scale_y;
if (i < transit_y0) {
scale_y = (i + 1 > transit_y1 ? st.height_scale : i + 1 - transit_y0);
} else {
scale_y = (i + 1 > transit_y1 ? transit_y1 - i : 1.0);
}
y_scales.push_back(scale_y);
y_ptrs.push_back(input_ptr + (b * st.in_height * st.in_width * st.channels +
Bound(i, st.in_height) * st.in_width * st.channels));
}
if (kKnownNumChannels == 3) {
for (size_t x = 0; x < st.out_width; ++x) {
const ResizeAreaCachedInterpolation &x_interp = x_interps[x];
if (x_interp.needs_bounding) {
ComputePatchSumOf3Channels<true>(scale, st, y_ptrs, y_scales, x_interp, output_ptr);
} else {
ComputePatchSumOf3Channels<false>(scale, st, y_ptrs, y_scales, x_interp, output_ptr);
}
output_ptr += 3;
}
} else {
for (size_t x = 0; x < st.out_width; ++x) {
const ResizeAreaCachedInterpolation &x_interp = x_interps[x];
if (x_interp.needs_bounding) {
ComputePatchSum<true>(scale, st, y_ptrs, y_scales, x_interp, output_ptr);
} else {
ComputePatchSum<false>(scale, st, y_ptrs, y_scales, x_interp, output_ptr);
}
output_ptr += st.channels;
}
}
}
}
}
return KERNEL_STATUS_OK;
}
// compute the value of a specific pixel when the number of channels is 3
template <bool NeedsXBounding, typename T>
void ResizeAreaCpuKernel::ComputePatchSumOf3Channels(float scale, const ResizeAreaSt &st,
const std::vector<const T *> &y_ptrs,
const std::vector<float> &y_scales,
const ResizeAreaCachedInterpolation &x_interp,
float *&output_patch_ptr) {
#define BOUND_IF_NEEDED(x, y) (NeedsXBounding ? Bound(x, y) : (x))
float sum_0 = 0;
float sum_1 = 0;
float sum_2 = 0;
for (size_t i = 0; i < y_ptrs.size(); ++i) {
const T *ptr = y_ptrs[i];
float scale_x = x_interp.start_scale;
int64_t offset = 3 * BOUND_IF_NEEDED(x_interp.start, st.in_width);
float sum_y_0 = static_cast<float>(ptr[offset + 0]) * scale_x;
float sum_y_1 = static_cast<float>(ptr[offset + 1]) * scale_x;
float sum_y_2 = static_cast<float>(ptr[offset + 2]) * scale_x;
if (x_interp.start + 1 != x_interp.end) {
for (size_t x = x_interp.start + 1; x < x_interp.end - 1; ++x) {
int64_t offset = 3 * BOUND_IF_NEEDED(x, st.in_width);
sum_y_0 += static_cast<float>(ptr[offset + 0]);
sum_y_1 += static_cast<float>(ptr[offset + 1]);
sum_y_2 += static_cast<float>(ptr[offset + 2]);
}
scale_x = x_interp.end_minus_one_scale;
offset = 3 * BOUND_IF_NEEDED(x_interp.end - 1, st.in_width);
sum_y_0 += static_cast<float>(ptr[offset + 0]) * scale_x;
sum_y_1 += static_cast<float>(ptr[offset + 1]) * scale_x;
sum_y_2 += static_cast<float>(ptr[offset + 2]) * scale_x;
}
sum_0 += sum_y_0 * y_scales[i];
sum_1 += sum_y_1 * y_scales[i];
sum_2 += sum_y_2 * y_scales[i];
}
output_patch_ptr[0] = sum_0 * scale;
output_patch_ptr[1] = sum_1 * scale;
output_patch_ptr[2] = sum_2 * scale;
#undef BOUND_IF_NEEDED
}
// compute the value of a specific pixel when the number of channels is not 3
template <bool NeedsXBounding, typename T>
void ResizeAreaCpuKernel::ComputePatchSum(float scale, const ResizeAreaSt &st, const std::vector<const T *> &y_ptrs,
const std::vector<float> &y_scales,
const ResizeAreaCachedInterpolation &x_interp, float *&output_patch_ptr) {
#define BOUND_IF_NEEDED(x, y) (NeedsXBounding ? Bound(x, y) : (x))
const auto num_channels = st.channels;
for (size_t c = 0; c < num_channels; ++c) {
float sum = 0;
for (size_t i = 0; i < y_ptrs.size(); ++i) {
const T *ptr = y_ptrs[i];
float scale_x = x_interp.start_scale;
float sum_y = static_cast<float>(ptr[num_channels * BOUND_IF_NEEDED(x_interp.start, st.in_width) + c]) * scale_x;
if (x_interp.start + 1 != x_interp.end) {
for (size_t x = x_interp.start + 1; x < x_interp.end - 1; ++x) {
sum_y += static_cast<float>(ptr[num_channels * BOUND_IF_NEEDED(x, st.in_width) + c]);
}
scale_x = x_interp.end_minus_one_scale;
sum_y += static_cast<float>(ptr[num_channels * BOUND_IF_NEEDED(x_interp.end - 1, st.in_width) + c]) * scale_x;
}
sum += sum_y * y_scales[i];
}
output_patch_ptr[c] = sum * scale;
}
#undef BOUND_IF_NEEDED
}
// check params
uint32_t ResizeAreaCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "ResizeArea check input and output number failed.");
Tensor *input_tensor1 = ctx.Input(0);
Tensor *input_tensor2 = ctx.Input(1);
Tensor *output_tensor = ctx.Output(0);
auto outsize = reinterpret_cast<int32_t *>(input_tensor2->GetData());
int32_t out_height = static_cast<int32_t>(outsize[0]);
int32_t out_width = static_cast<int32_t>(outsize[1]);
in_shape1 = input_tensor1->GetTensorShape()->GetDimSizes();
in_shape2 = input_tensor2->GetTensorShape()->GetDimSizes();
out_shape = output_tensor->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE(in_shape1.size() == 4, KERNEL_STATUS_PARAM_INVALID, "Dim of input[0] must be 4, but got [%zu].",
in_shape1.size());
KERNEL_CHECK_FALSE(in_shape2.size() == 1, KERNEL_STATUS_PARAM_INVALID, "Dim of input[1] must be 1, but got [%zu].",
in_shape2.size());
KERNEL_CHECK_FALSE(out_shape.size() == 4, KERNEL_STATUS_PARAM_INVALID, "Dim of output[0] must be 4, but got [%zu].",
out_shape.size());
KERNEL_CHECK_FALSE(out_height > 0 && out_width > 0, KERNEL_STATUS_PARAM_INVALID, "outsize must be positive.");
AttrValue *attr_align_corners = ctx.GetAttr("align_corners");
align_corners = (attr_align_corners == nullptr) ? false : (attr_align_corners->GetBool());
dtype_ = input_tensor1->GetDataType();
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kResizeArea, ResizeAreaCpuKernel);
} // namespace aicpu
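
To make the row-weight bookkeeping above concrete, here is a hedged standalone sketch that recomputes the x_interps fields (start, end, start_scale, end_minus_one_scale) for a small shrink with align_corners == false; CachedInterp and RowWeights are illustrative names, and the struct simply mirrors ResizeAreaCachedInterpolation so the snippet compiles on its own.

// Standalone sketch of the per-column interpolation weights computed in
// ResizeAreaCpuKernel::Compute, assuming align_corners == false.
#include <cmath>
#include <cstdio>
#include <vector>

struct CachedInterp {         // mirrors ResizeAreaCachedInterpolation
  size_t start;               // first source column contributing to this output column
  size_t end;                 // one past the last contributing source column
  float start_scale;          // fractional weight of the first column
  float end_minus_one_scale;  // fractional weight of the last column
};

std::vector<CachedInterp> RowWeights(size_t in_width, size_t out_width) {
  const float width_scale = in_width / static_cast<float>(out_width);
  std::vector<CachedInterp> interps(out_width);
  for (size_t x = 0; x < out_width; ++x) {
    const float x0 = x * width_scale;
    const float x1 = (x + 1) * width_scale;
    size_t v = static_cast<size_t>(std::floor(x0));
    interps[x].start = v;
    interps[x].start_scale = (v + 1 > x1 ? width_scale : v + 1 - x0);
    v = static_cast<size_t>(std::ceil(x1));
    interps[x].end = v;
    v = interps[x].end - 1;
    interps[x].end_minus_one_scale = (v + 1 > x1 ? x1 - v : 1.0f);
  }
  return interps;
}

int main() {
  // Shrinking a 5-pixel row to 2 output pixels: each output cell spans 2.5 source columns.
  for (const auto &it : RowWeights(5, 2)) {
    std::printf("start=%zu end=%zu start_scale=%.2f end_scale=%.2f\n",
                it.start, it.end, it.start_scale, it.end_minus_one_scale);
  }
  return 0;
}

For RowWeights(5, 2) each output cell covers 2.5 source columns, so the two entries come out as start=0, end=3 with scales 1.00/0.50 and start=2, end=5 with scales 0.50/1.00; those partial-coverage weights are what ComputePatchSum and ComputePatchSumOf3Channels later multiply into the accumulated sums.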

View File

@ -0,0 +1,88 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RESIZE_AREA_H_
#define AICPU_KERNELS_NORMALIZED_RESIZE_AREA_H_
#include <string>
#include "Eigen/Core"
#include "cpu_ops_kernel.h"
namespace aicpu {
std::vector<int64_t> in_shape1;
std::vector<int64_t> in_shape2;
std::vector<int64_t> out_shape;
bool align_corners = false;
// weight data of every pixel
struct ResizeAreaCachedInterpolation {
size_t start;
size_t end;
float start_scale;
float end_minus_one_scale;
bool needs_bounding = true;
};
inline int64_t Bound(int64_t val, int64_t limit);
float Scaling_(size_t in_size, size_t out_size, bool align_corners);
struct ResizeAreaSt {
void CalSt(CpuKernelContext &ctx) {
Tensor *input_tensor1 = ctx.Input(0);
Tensor *input_tensor2 = ctx.Input(1);
in_shape1 = input_tensor1->GetTensorShape()->GetDimSizes();
auto outsize = reinterpret_cast<int32_t *>(input_tensor2->GetData());
batch_size = in_shape1[0];
channels = in_shape1[3];
in_height = in_shape1[1];
in_width = in_shape1[2];
out_height = outsize[0];
out_width = outsize[1];
height_scale = Scaling_(in_height, out_height, align_corners);
width_scale = Scaling_(in_width, out_width, align_corners);
}
size_t batch_size;
size_t channels;
size_t in_height;
size_t in_width;
size_t out_height;
size_t out_width;
float height_scale;
float width_scale;
};
class ResizeAreaCpuKernel : public CpuKernel {
public:
~ResizeAreaCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(const ResizeAreaSt &st, std::vector<ResizeAreaCachedInterpolation> &x_interps,
int64_t kKnownNumChannels, CpuKernelContext &ctx);
template <bool NeedsXBounding, typename T>
void ComputePatchSumOf3Channels(float scale, const ResizeAreaSt &st, const std::vector<const T *> &y_ptrs,
const std::vector<float> &y_scales, const ResizeAreaCachedInterpolation &x_interp,
float *&output_patch_ptr);
template <bool NeedsXBounding, typename T>
void ComputePatchSum(float scale, const ResizeAreaSt &st, const std::vector<const T *> &y_ptrs,
const std::vector<float> &y_scales, const ResizeAreaCachedInterpolation &x_interp,
float *&output_patch_ptr);
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
DataType dtype_ = DT_INT8;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,337 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "segment_mean.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const int64_t kParallelDataNum = 2 * 1024;
const char *kSegmentMean = "SegmentMean";
#define SEGMENTMEAN_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentMeanCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentMean kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTMEAN_COMPUTE_CASE_Complex(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentMeanCompute_Complex<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentMean kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTMEAN_COMPUTE_CASE_ALL(TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE_Complex(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE_Complex(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) SEGMENTMEAN_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) SEGMENTMEAN_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
SEGMENTMEAN_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
template <typename T>
T ComplexDiv(T sum, int64_t num) {
T res;
auto real = sum.real();
auto imag = sum.imag();
res.real(real / num);
res.imag(imag / num);
return res;
}
uint32_t SegmentMeanCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SegmentMean check input and output number failed.");
Tensor *input_data = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
Tensor *segment_ids_data = ctx.Input(1);
KERNEL_CHECK_NULLPTR(segment_ids_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
Tensor *output_data = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto segment_ids_type = ctx.Input(1)->GetDataType();
switch (segment_ids_type) {
case DT_INT32: {
switch (data_type) {
SEGMENTMEAN_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
case DT_INT64: {
switch (data_type) {
SEGMENTMEAN_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
default: {
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(segment_ids_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t SegmentMeanCpuKernel::SegmentMeanCompute(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
int64_t input_data_num = input_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(0);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_data_num / (input_data->GetTensorShape()->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < kParallelDataNum) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
if (num_compare_per < kParallelDataNum) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value / count;
}
} else {
uint32_t min_core_num = 1;
int64_t mean_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (mean_core_num > num_compare_per) {
mean_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value / count;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / mean_core_num, shard_compute),
"SegmentMean Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t mean_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (mean_core_num_seg > num_segments) {
mean_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value / count;
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / mean_core_num_seg, shard_compute_seg),
"SegmentMean Compute failed.");
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t SegmentMeanCpuKernel::SegmentMeanCompute_Complex(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
int64_t input_data_num = input_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(0);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_data_num / (input_data->GetTensorShape()->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < kParallelDataNum) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
if (num_compare_per < kParallelDataNum) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = ComplexDiv(sum_value, count);
}
} else {
uint32_t min_core_num = 1;
int64_t mean_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (mean_core_num > num_compare_per) {
mean_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
int cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = ComplexDiv(sum_value, count);
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / mean_core_num, shard_compute),
"SegmentMean Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t mean_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (mean_core_num_seg > num_segments) {
mean_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t mean_init_addr = input_addr_base + j;
T1 sum_value = input_data_addr[mean_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = mean_init_addr + k * num_compare_per;
sum_value += input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = ComplexDiv(sum_value, count);
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / mean_core_num_seg, shard_compute_seg),
"SegmentMean Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSegmentMean, SegmentMeanCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SEGMENTMEAN_H_
#define AICPU_KERNELS_NORMALIZED_SEGMENTMEAN_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SegmentMeanCpuKernel : public CpuKernel {
public:
SegmentMeanCpuKernel() = default;
~SegmentMeanCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t SegmentMeanCompute(CpuKernelContext &ctx);
template <typename T1, typename T2>
static uint32_t SegmentMeanCompute_Complex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SEGMENTMEAN_H_

View File

@ -0,0 +1,243 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "segment_min.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include <iostream>
#include <vector>
#include <unordered_map>
using namespace std;
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kSegmentMin = "SegmentMin";
#define SEGMENT_MIN_COMPUTE_CASE(DTYPE, TYPE, CTX, STYPE) \
case (DTYPE): { \
uint32_t res; \
switch (STYPE) { \
case DT_INT32: \
res = SegmentMinCompute<TYPE, int32_t>(CTX); \
break; \
case DT_INT64: \
res = SegmentMinCompute<TYPE, int64_t>(CTX); \
break; \
default: \
KERNEL_LOG_ERROR("SegmentMin kernel segment_ids type [%s] not support.", DTypeStr(STYPE).c_str()); \
return KERNEL_STATUS_PARAM_INVALID; \
} \
if (res != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentMin kernel compute failed."); \
return res; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SegmentMinCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SegmentMin check input and output number failed.");
KERNEL_HANDLE_ERROR(SegmentMinCheck(ctx), "SegmentMin check params failed.");
auto type_data = ctx.Input(0)->GetDataType();
auto type_seg = ctx.Input(1)->GetDataType();
switch (type_data) {
SEGMENT_MIN_COMPUTE_CASE(DT_INT8, int8_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_INT16, int16_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_INT32, int32_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_INT64, int64_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_UINT8, uint8_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_UINT32, uint32_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_UINT16, uint16_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_UINT64, uint64_t, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_FLOAT, float, ctx, type_seg)
SEGMENT_MIN_COMPUTE_CASE(DT_DOUBLE, double, ctx, type_seg)
default:
KERNEL_LOG_ERROR("SegmentMin kernel data type [%s] not support.", DTypeStr(type_data).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <class T1, class T2>
uint32_t SegmentMinCpuKernel::SegmentMinCompute(CpuKernelContext &ctx) {
auto data = ctx.Input(0); // tensor*
auto segment_ids = ctx.Input(1);
auto output = ctx.Output(0);
auto data_data = reinterpret_cast<T1 *>(data->GetData());
auto segment_ids_data = reinterpret_cast<T2 *>(segment_ids->GetData());
auto segment_ids_len = segment_ids->NumElements();
auto data_len = data->NumElements();
auto data_shape = data->GetTensorShape();
auto segment_ids_shape = segment_ids->GetTensorShape();
auto output_data = reinterpret_cast<T1 *>(output->GetData());
uint64_t output_len = output->NumElements();
uint64_t len2 = data_len / data_shape->GetDimSize(0);
uint64_t _8k = 8 * 1024, _2k = 2 * 1024;
  // Initialize the output to 0.
if (output_len <= _8k) {
    for (uint64_t i = 0; i < output_len; i++) output_data[i] = static_cast<T1>(0);
} else {
uint32_t min_core = 1;
uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) / 2);
if (max_core > output_len) {
max_core = output_len;
}
auto init = [&](size_t start, size_t end) {
      for (auto i = start; i < end; i++) output_data[i] = static_cast<T1>(0);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, output_len, output_len / max_core, init),
"Initialize value of output failed.");
}
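  // Group equal consecutive segment ids: nums[i] is the i-th distinct id and ranges[i] the
  // [first, last] input-row range carrying it.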
vector<T2> nums;
vector<pair<uint64_t, uint64_t>> ranges;
for (int64_t i = 0; i < segment_ids_len; ++i) {
if (i) {
if (segment_ids_data[i] == nums.back()) {
++ranges.back().second;
} else {
        nums.push_back(segment_ids_data[i]);
        ranges.push_back({i, i});
}
} else {
      nums.push_back(segment_ids_data[0]);
      ranges.push_back(make_pair(0, 0));
}
}
uint64_t nums_len = nums.size();
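  // With many segments, parallelize across the segments; otherwise walk them serially and
  // parallelize across the inner dimension when it is wide enough.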
if (nums_len > _8k) {
uint32_t min_core = 1;
uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
max_core = std::min(max_core, nums_len);
auto mt_for_nums = [&](size_t start_num, size_t end_num) {
for (auto i = start_num; i < end_num; ++i) {
uint64_t st = ranges[i].first, ed = ranges[i].second;
uint64_t output_start = nums[i] * len2;
for (uint64_t k = 0; k < len2; k++) {
for (uint64_t j = st; j <= ed; j++) {
uint64_t data_start = j * len2;
T1 &u = output_data[output_start + k], &v = data_data[data_start + k];
if (j == st)
u = v;
else
u = std::min(u, v);
}
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, nums_len, nums_len / max_core, mt_for_nums),
"SegmentMin Compute failed.");
} else {
for (uint64_t i = 0; i < nums_len; ++i) {
uint64_t st = ranges[i].first, ed = ranges[i].second;
uint64_t output_start = nums[i] * len2;
if (len2 < _2k) {
for (uint64_t k = 0; k < len2; k++) {
for (uint64_t j = st; j <= ed; j++) {
uint64_t data_start = j * len2;
T1 &u = output_data[output_start + k], &v = data_data[data_start + k];
if (j == st) {
u = v;
} else {
u = std::min(u, v);
}
}
}
} else {
uint32_t min_core = 1;
uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
max_core = std::min(max_core, len2);
auto mt_for_len2 = [&](size_t start_len, size_t end_len) {
for (uint64_t k = start_len; k < end_len; k++) {
for (uint64_t j = st; j <= ed; j++) {
uint64_t data_start = j * len2;
T1 &u = output_data[output_start + k], &v = data_data[data_start + k];
if (j == st) {
u = v;
} else {
u = std::min(u, v);
}
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, len2, len2 / max_core, mt_for_len2),
"SegmentMin Compute failed.");
}
}
}
return KERNEL_STATUS_OK;
}
uint32_t SegmentMinCpuKernel::SegmentMinCheck(CpuKernelContext &ctx) {
// inspect the input & output pointer
KERNEL_CHECK_NULLPTR(ctx.Input(0), KERNEL_STATUS_PARAM_INVALID, "Get input 0 failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(1), KERNEL_STATUS_PARAM_INVALID, "Get input 1 failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0), KERNEL_STATUS_PARAM_INVALID, "Get output failed.")
// inspect data in input & output
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed.")
  // parameter validity checks
KERNEL_CHECK_FALSE(CheckType(ctx.Input(1)), KERNEL_STATUS_PARAM_INVALID,
"The data type of segment_ids should be DT_INT32 or DT_INT64.")
KERNEL_CHECK_FALSE(CheckDim(ctx.Input(1)), KERNEL_STATUS_PARAM_INVALID, "The dimension of segment_ids should be 1.")
KERNEL_CHECK_FALSE(CheckSorted(ctx.Input(1)), KERNEL_STATUS_PARAM_INVALID,
"segment_ids should be ascending and no negative number in it.")
KERNEL_CHECK_FALSE(CheckLength(ctx.Input(1), ctx.Input(0)), KERNEL_STATUS_PARAM_INVALID,
"The length of segment_ids should be equal to the length "
"of the first dimension of the data")
KERNEL_LOG_DEBUG(
"SegmentMinCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_OK;
}
bool SegmentMinCpuKernel::CheckType(Tensor *t) {
DataType type = t->GetDataType();
return type == DT_INT32 || type == DT_INT64;
}
bool SegmentMinCpuKernel::CheckDim(Tensor *t) {
auto dims = t->GetTensorShape()->GetDims();
return dims == 1;
}
bool SegmentMinCpuKernel::CheckSorted(Tensor *tensor) {
DataType type = tensor->GetDataType();
auto len = tensor->NumElements();
switch (type) {
case DT_INT32: {
auto data = reinterpret_cast<int32_t *>(tensor->GetData());
for (int64_t i = 0; i < len; i++)
if ((i && data[i] < data[i - 1]) || data[i] < 0) {
return false;
}
break;
}
case DT_INT64: {
auto data = reinterpret_cast<int64_t *>(tensor->GetData());
for (int64_t i = 0; i < len; i++)
if ((i && data[i] < data[i - 1]) || data[i] < 0) {
return false;
}
break;
}
default:
return true;
}
return true;
}
bool SegmentMinCpuKernel::CheckLength(Tensor *seg, Tensor *data) {
auto len1 = seg->NumElements();
auto len2 = data->GetTensorShape()->GetDimSize(0);
return len1 == len2;
}
REGISTER_CPU_KERNEL(kSegmentMin, SegmentMinCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SEGMENT_MIN_H_
#define AICPU_KERNELS_NORMALIZED_SEGMENT_MIN_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SegmentMinCpuKernel : public CpuKernel {
public:
SegmentMinCpuKernel() = default;
~SegmentMinCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <class T1, class T2>
static uint32_t SegmentMinCompute(CpuKernelContext &ctx);
static uint32_t SegmentMinCheck(CpuKernelContext &ctx);
static bool CheckType(Tensor *t);
static bool CheckDim(Tensor *t);
static bool CheckSorted(Tensor *t);
static bool CheckLength(Tensor *seg, Tensor *data);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SEGMENT_MIN_H_

View File

@ -0,0 +1,342 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "segment_prod.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kSegmentProd = "SegmentProd";
#define SEGMENTPROD_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentProdCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentProd kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTPROD_COMPUTE_CASE_CP(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentProdCompute_Complex<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentProd kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTPROD_COMPUTE_CASE_ALL(TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE_CP(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE_CP(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
SEGMENTPROD_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t SegmentProdCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SegmentProd check input and output number failed.");
Tensor *input_data = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
Tensor *segment_ids_data = ctx.Input(1);
KERNEL_CHECK_NULLPTR(segment_ids_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
Tensor *output_data = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto segment_ids_type = ctx.Input(1)->GetDataType();
switch (segment_ids_type) {
case DT_INT32: {
switch (data_type) {
SEGMENTPROD_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
case DT_INT64: {
switch (data_type) {
SEGMENTPROD_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
default: {
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(segment_ids_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
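// Complex multiplication written out explicitly: (a + bi) * (x + yi) = (ax - by) + (bx + ay)i.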
template <typename T>
T SegmentProdCpuKernel::ComputeMul(T num_1, T num_2) {
T res;
auto a = num_1.real();
auto b = num_1.imag();
auto x = num_2.real();
auto y = num_2.imag();
auto real_res = a * x - b * y;
auto imag_res = b * x + a * y;
res.real(real_res);
res.imag(imag_res);
return res;
}
template <typename T1, typename T2>
uint32_t SegmentProdCpuKernel::SegmentProdCompute(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
int64_t input_data_num = input_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(1);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_data_num / (input_data->GetTensorShape()->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < 2 * 1024) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
if (num_compare_per < 2 * 1024) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value *= input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
} else {
uint32_t min_core_num = 1;
int64_t prod_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (prod_core_num > num_compare_per) {
prod_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
              int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value *= input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / prod_core_num, shard_compute),
"SegmentProd Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t prod_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (prod_core_num_seg > num_segments) {
prod_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value *= input_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / prod_core_num_seg, shard_compute_seg),
"SegmentProd Compute failed.");
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t SegmentProdCpuKernel::SegmentProdCompute_Complex(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
int64_t input_data_num = input_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(1);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_data_num / (input_data->GetTensorShape()->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < 2 * 1024) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
if (num_compare_per < 2 * 1024) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value = ComputeMul(prod_value, input_data_addr[cmp_addr]);
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
} else {
uint32_t min_core_num = 1;
int64_t prod_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (prod_core_num > num_compare_per) {
prod_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
              int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value = ComputeMul(prod_value, input_data_addr[cmp_addr]);
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / prod_core_num, shard_compute),
"SegmentProd Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t prod_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (prod_core_num_seg > num_segments) {
prod_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t prod_init_addr = input_addr_base + j;
T1 prod_value = input_data_addr[prod_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = prod_init_addr + k * num_compare_per;
prod_value = ComputeMul(prod_value, input_data_addr[cmp_addr]);
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = prod_value;
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / prod_core_num_seg, shard_compute_seg),
"SegmentProd Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSegmentProd, SegmentProdCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SEGMENTPROD_H_
#define AICPU_KERNELS_NORMALIZED_SEGMENTPROD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SegmentProdCpuKernel : public CpuKernel {
public:
SegmentProdCpuKernel() = default;
~SegmentProdCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static T ComputeMul(T num_1, T num_2);
template <typename T1, typename T2>
static uint32_t SegmentProdCompute(CpuKernelContext &ctx);
template <typename T1, typename T2>
static uint32_t SegmentProdCompute_Complex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SEGMENTPROD_H_

View File

@ -0,0 +1,212 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "segment_sum.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kSegmentSum = "SegmentSum";
const int64_t kDataSize = 2 * 1024;
#define SEGMENTSUM_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = SegmentSumCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SegmentSum kernel compute failed."); \
return result; \
} \
break; \
}
#define SEGMENTSUM_COMPUTE_CASE_ALL(TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
SEGMENTSUM_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t SegmentSumCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SegmentSum check input and output number failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto segment_ids_type = ctx.Input(1)->GetDataType();
switch (segment_ids_type) {
case DT_INT32: {
switch (data_type) {
SEGMENTSUM_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
case DT_INT64: {
switch (data_type) {
SEGMENTSUM_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
}
default: {
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(segment_ids_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t SegmentSumCpuKernel::SegmentSumCompute(CpuKernelContext &ctx) {
Tensor *input_x_data = ctx.Input(0);
auto input_x_data_addr = reinterpret_cast<T1 *>(input_x_data->GetData());
auto input_x_shape = input_x_data->GetTensorShape();
auto input_x_dims = input_x_shape->GetDimSizes();
int64_t input_x_data_num = input_x_data->NumElements();
Tensor *segment_ids_data = ctx.Input(1);
auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
int64_t segment_ids_data_num = segment_ids_data->NumElements();
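  // segment_ids is ascending, so the last id determines the number of output segments.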
input_x_dims[0] = segment_ids_data_addr[segment_ids_data_num - 1] + 1;
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
auto output_data_shape = output_data->GetTensorShape();
if (output_data_shape->GetDimSize(0) < input_x_dims[0]) {
KERNEL_LOG_ERROR("The number of segments of the segmentation result of segment_ids is too large.");
return KERNEL_STATUS_PARAM_INVALID;
}
output_data_shape->SetDimSizes(input_x_dims);
if (!output_data->SetTensorShape(output_data_shape.get())) {
KERNEL_LOG_ERROR("Set output shape failed.");
return KERNEL_STATUS_INNER_ERROR;
}
int64_t output_data_num = output_data->NumElements();
for (int64_t i = 0; i < output_data_num; i++) {
output_data_addr[i] = static_cast<T1>(0);
}
std::vector<int64_t> segments;
if (segment_ids_data_num != (input_x_data->GetTensorShape()->GetDimSize(0))) {
KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[0] < 0) {
KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t seg_tmp = 1;
for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
seg_tmp++;
} else {
segments.push_back(seg_tmp);
seg_tmp = 1;
}
if (i == segment_ids_data_num - 2) {
segments.push_back(seg_tmp);
}
}
const int64_t num_compare_per = input_x_data_num / (input_x_shape->GetDimSize(0));
const int64_t num_segments = segments.size();
if (num_segments < kDataSize) {
for (int64_t i = 0; i < num_segments; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (int64_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
      if (num_compare_per < kDataSize) {
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t sum_init_addr = input_addr_base + j;
T1 sum_value = input_x_data_addr[sum_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = sum_init_addr + k * num_compare_per;
sum_value += input_x_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value;
}
} else {
uint32_t min_core_num = 1;
int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (sum_core_num > num_compare_per) {
sum_core_num = num_compare_per;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
int64_t sum_init_addr = input_addr_base + j;
T1 sum_value = input_x_data_addr[sum_init_addr];
for (int64_t k = 1; k < count; k++) {
              int64_t cmp_addr = sum_init_addr + k * num_compare_per;
sum_value += input_x_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / sum_core_num, shard_compute),
"SegmentSum Compute failed.");
}
}
} else {
uint32_t min_core_num_seg = 1;
int64_t sum_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (sum_core_num_seg > num_segments) {
sum_core_num_seg = num_segments;
}
auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
for (size_t i = start_seg; i < end_seg; i++) {
int64_t count = segments[i];
int64_t count_no = 0;
for (size_t j = 0; j < i; j++) {
count_no += segments[j];
}
int64_t input_addr_base = count_no * num_compare_per;
for (int64_t j = 0; j < num_compare_per; j++) {
int64_t sum_init_addr = input_addr_base + j;
T1 sum_value = input_x_data_addr[sum_init_addr];
for (int64_t k = 1; k < count; k++) {
            int64_t cmp_addr = sum_init_addr + k * num_compare_per;
sum_value += input_x_data_addr[cmp_addr];
}
output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = sum_value;
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, num_segments, num_segments / sum_core_num_seg, shard_compute_seg),
"SegmentSum Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSegmentSum, SegmentSumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SEGMENTSUM_H_
#define AICPU_KERNELS_NORMALIZED_SEGMENTSUM_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SegmentSumCpuKernel : public CpuKernel {
public:
SegmentSumCpuKernel() = default;
~SegmentSumCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t SegmentSumCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SEGMENTSUM_H_

View File

@ -0,0 +1,282 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "slice.h"
#include "securec.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include <iostream>
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const char *kSlice = "Slice";
#define SLICE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SliceCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Slice kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
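// Reads the offsets/size tensor into an int64_t vector; only int32 and int64 element types are accepted.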
uint32_t SliceCpuKernel::GetSliceValue(Tensor *tensor, std::vector<int64_t> &value) {
auto type = tensor->GetDataType();
if (type == DT_INT32) {
auto data = reinterpret_cast<int32_t *>(tensor->GetData());
for (int64_t i = 0; i < tensor->NumElements(); i++) {
value.push_back(static_cast<int64_t>(*(data + i)));
}
} else if (type == DT_INT64) {
auto data = reinterpret_cast<int64_t *>(tensor->GetData());
for (int64_t i = 0; i < tensor->NumElements(); i++) {
value.push_back(*(data + i));
}
} else {
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SliceCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSlice);
KERNEL_HANDLE_ERROR(SliceCheck(ctx), "[%s] check params failed.", kSlice);
auto x_type = ctx.Input(0)->GetDataType();
switch (x_type) {
SLICE_COMPUTE_CASE(DT_BOOL, bool, ctx)
SLICE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
SLICE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
SLICE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
SLICE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SLICE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SLICE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
SLICE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SLICE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SLICE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SLICE_COMPUTE_CASE(DT_FLOAT, float, ctx)
SLICE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
SLICE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
SLICE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Slice kernel data type [%s] not support.", DTypeStr(x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SliceCpuKernel::SliceCheck(CpuKernelContext &ctx) {
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(kThirdInputIndex)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 2 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 tensor shape failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 tensor shape failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(kThirdInputIndex)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
"Get input 2 tensor shape failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
"Get output 0 tensor shape failed.")
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_offsets = ctx.Input(1)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_size = ctx.Input(kThirdInputIndex)->GetTensorShape()->GetDimSizes();
auto offsets_tensor = ctx.Input(1);
auto size_tensor = ctx.Input(2);
auto y_tensor = ctx.Output(0);
KERNEL_CHECK_FALSE((offsets_tensor->NumElements() == static_cast<int64_t>(shape_x.size())),
KERNEL_STATUS_PARAM_INVALID, "Expected offsets to be 1-D tensors of size [%zu], but got [%zu].",
shape_x.size(), offsets_tensor->NumElements())
KERNEL_CHECK_FALSE((size_tensor->NumElements() == static_cast<int64_t>(shape_x.size())), KERNEL_STATUS_PARAM_INVALID,
"Expected size to be 1-D tensors of size [%zu], but got [%zu].", shape_x.size(),
size_tensor->NumElements())
KERNEL_CHECK_FALSE((GetSliceValue(offsets_tensor, offsets) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"Offsets must be either int32 or int64, but got [%s].",
DTypeStr(offsets_tensor->GetDataType()).c_str())
KERNEL_CHECK_FALSE((GetSliceValue(size_tensor, size) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"Size must be either int32 or int64, but got [%s].", DTypeStr(size_tensor->GetDataType()).c_str())
is_identity = true;
slice_dim0 = true;
std::vector<int64_t> shape_y;
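  // Validate offsets/size per dimension and detect two fast paths: is_identity (the slice covers
  // the whole input) and slice_dim0 (only dim 0 is sliced, leaving one contiguous block).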
for (size_t i = 0; i < shape_x.size(); ++i) {
if (size.at(i) == -1) {
size.at(i) = shape_x.at(i) - offsets.at(i);
}
int64_t offset = offsets.at(i);
int64_t size_dim = size.at(i);
if (shape_x.at(i) == 0) {
KERNEL_CHECK_FALSE((offset == 0 && size_dim == 0), KERNEL_STATUS_PARAM_INVALID,
"Expected offsets[%zu] == 0 (got %zu) and size[%zu] == 0 (got %zu),"
" when x shape[%zu] == 0.",
i, offset, i, size_dim, i)
} else {
KERNEL_CHECK_FALSE((0 <= offset && offset < shape_x.at(i)), KERNEL_STATUS_PARAM_INVALID,
"Expected offsets[%zu] in [0, %zu], but got %zu.", i, shape_x.at(i), offset)
KERNEL_CHECK_FALSE((0 <= size_dim && offset + size_dim <= shape_x.at(i)), KERNEL_STATUS_PARAM_INVALID,
"Expected size[%zu] in [0, %zu], but got %zu.", i, shape_x.at(i) - offset, size_dim)
}
bool take_all = (offset == 0) && (size_dim == shape_x.at(i));
is_identity &= take_all;
slice_dim0 &= (i == 0) || take_all;
shape_y.push_back(size_dim);
}
y_tensor->GetTensorShape()->SetDimSizes(shape_y);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SliceCpuKernel::SliceCompute(CpuKernelContext &ctx) {
auto x_data = ctx.Input(0)->GetData();
auto y_data = ctx.Output(0)->GetData();
int64_t num_output = ctx.Output(0)->NumElements();
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_y = ctx.Output(0)->GetTensorShape()->GetDimSizes();
if (num_output == 0) {
return KERNEL_STATUS_OK;
}
if (is_identity) {
int64_t input_size = ctx.Input(0)->GetDataSize();
int cpret = memcpy_s(y_data, input_size, x_data, input_size);
KERNEL_CHECK_FALSE((cpret == EOK), KERNEL_STATUS_INNER_ERROR, "[%s] memcpy_s to output failed, size [%llu].",
kSlice, input_size);
return KERNEL_STATUS_OK;
}
if (slice_dim0) {
    // Only dim 0 is sliced and every other dim is taken in full, so the output is one contiguous
    // block of num_output elements starting at input row offsets[0].
    int64_t inner_size = num_output / size.at(0);
    int64_t data_size = num_output * static_cast<int64_t>(sizeof(T));
    int cpret = memcpy_s(y_data, data_size, static_cast<T *>(x_data) + offsets.at(0) * inner_size, data_size);
    KERNEL_CHECK_FALSE((cpret == EOK), KERNEL_STATUS_INNER_ERROR, "[%s] memcpy_s to output failed, size [%llu].",
                       kSlice, data_size);
return KERNEL_STATUS_OK;
}
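  // General path: materialize the slice through Eigen TensorMap::slice for ranks 2 to 7.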
auto input_data = reinterpret_cast<T *>(x_data);
auto output_data = reinterpret_cast<T *>(y_data);
size_t input_dims = shape_x.size();
switch (input_dims) {
case INPUT_NUM2: {
using Eigen_Tensor_2D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM2), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_2D input_2D(input_data, shape_x.at(0), shape_x.at(1));
Eigen_Tensor_2D output_2D(output_data, shape_y.at(0), shape_y.at(1));
Eigen::array<Eigen::DenseIndex, INPUT_NUM2> offsets_2D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM2> size_2D;
for (size_t i = 0; i < INPUT_NUM2; ++i) {
offsets_2D[i] = offsets.at(i);
size_2D[i] = size.at(i);
}
output_2D = input_2D.slice(offsets_2D, size_2D);
break;
}
case INPUT_NUM3: {
using Eigen_Tensor_3D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM3), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_3D input_3D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2));
Eigen_Tensor_3D output_3D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2));
Eigen::array<Eigen::DenseIndex, INPUT_NUM3> offsets_3D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM3> size_3D;
for (size_t i = 0; i < INPUT_NUM3; ++i) {
offsets_3D[i] = offsets.at(i);
size_3D[i] = size.at(i);
}
output_3D = input_3D.slice(offsets_3D, size_3D);
break;
}
case INPUT_NUM4: {
using Eigen_Tensor_4D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM4), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_4D input_4D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2),
shape_x.at(INPUT_NUM3));
Eigen_Tensor_4D output_4D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2),
shape_y.at(INPUT_NUM3));
Eigen::array<Eigen::DenseIndex, INPUT_NUM4> offsets_4D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM4> size_4D;
for (size_t i = 0; i < INPUT_NUM4; ++i) {
offsets_4D[i] = offsets.at(i);
size_4D[i] = size.at(i);
}
output_4D = input_4D.slice(offsets_4D, size_4D);
break;
}
case INPUT_NUM5: {
using Eigen_Tensor_5D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM5), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_5D input_5D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2), shape_x.at(INPUT_NUM3),
shape_x.at(INPUT_NUM4));
Eigen_Tensor_5D output_5D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2),
shape_y.at(INPUT_NUM3), shape_y.at(INPUT_NUM4));
Eigen::array<Eigen::DenseIndex, INPUT_NUM5> offsets_5D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM5> size_5D;
for (size_t i = 0; i < INPUT_NUM5; ++i) {
offsets_5D[i] = offsets.at(i);
size_5D[i] = size.at(i);
}
output_5D = input_5D.slice(offsets_5D, size_5D);
break;
}
case INPUT_NUM6: {
using Eigen_Tensor_6D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM6), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_6D input_6D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2), shape_x.at(INPUT_NUM3),
shape_x.at(INPUT_NUM4), shape_x.at(INPUT_NUM5));
Eigen_Tensor_6D output_6D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2),
shape_y.at(INPUT_NUM3), shape_y.at(INPUT_NUM4), shape_y.at(INPUT_NUM5));
Eigen::array<Eigen::DenseIndex, INPUT_NUM6> offsets_6D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM6> size_6D;
for (size_t i = 0; i < INPUT_NUM6; ++i) {
offsets_6D[i] = offsets.at(i);
size_6D[i] = size.at(i);
}
output_6D = input_6D.slice(offsets_6D, size_6D);
break;
}
case INPUT_NUM7: {
using Eigen_Tensor_7D =
Eigen::TensorMap<Eigen::Tensor<T, static_cast<int>(INPUT_NUM7), Eigen::RowMajor>, Eigen::Aligned>;
Eigen_Tensor_7D input_7D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(INPUT_NUM2), shape_x.at(INPUT_NUM3),
shape_x.at(INPUT_NUM4), shape_x.at(INPUT_NUM5), shape_x.at(INPUT_NUM6));
Eigen_Tensor_7D output_7D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(INPUT_NUM2),
shape_y.at(INPUT_NUM3), shape_y.at(INPUT_NUM4), shape_y.at(INPUT_NUM5),
shape_y.at(INPUT_NUM6));
Eigen::array<Eigen::DenseIndex, INPUT_NUM7> offsets_7D;
Eigen::array<Eigen::DenseIndex, INPUT_NUM7> size_7D;
for (size_t i = 0; i < INPUT_NUM7; ++i) {
offsets_7D[i] = offsets.at(i);
size_7D[i] = size.at(i);
}
output_7D = input_7D.slice(offsets_7D, size_7D);
break;
}
default:
KERNEL_LOG_ERROR("[%s] : Unhandled input dimensions [%zu].", kSlice, input_dims);
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSlice, SliceCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,44 @@
/**
* Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SLICE_H_
#define AICPU_KERNELS_NORMALIZED_SLICE_H_
#include "cpu_ops_kernel.h"
#include <vector>
namespace aicpu {
class SliceCpuKernel : public CpuKernel {
public:
SliceCpuKernel() = default;
~SliceCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
bool is_identity;
bool slice_dim0;
std::vector<int64_t> offsets;
std::vector<int64_t> size;
uint32_t GetSliceValue(Tensor *tensor, std::vector<int64_t> &value);
uint32_t SliceCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t SliceCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SLICE_H_

View File

@ -0,0 +1,518 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cpu_ops_kernel.h"
#include <string>
#include "sparse_cross.h"
#include <iostream>
namespace {
static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
static const uint64_t k1 = 0xb492b66fbe98f273ULL;
static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
const char *kSparseCross = "SparseCross";
} // namespace
namespace aicpu {
typedef std::pair<uint64_t, uint64_t> uint128_t;
inline uint64_t Uint128Low64(const uint128_t x) { return x.first; }
inline uint64_t Uint128High64(const uint128_t x) { return x.second; }
inline uint128_t Uint128(uint64_t lo, uint64_t hi) { return uint128_t(lo, hi); }
#define STATIC_INLINE static inline
using namespace std;
using ui = unsigned int;
using ul = unsigned long;
using uc = unsigned char;
using ull = unsigned long long;
static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
static const uint64_t k1 = 0xb492b66fbe98f273ULL;
static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
STATIC_INLINE uint64_t Fetch64(const char *p) {
uint64_t result;
memcpy(&result, p, sizeof(result));
return uint64_in_expected_order(result);
}
STATIC_INLINE uint32_t Fetch32(const char *p) {
uint32_t result;
memcpy(&result, p, sizeof(result));
return uint32_in_expected_order(result);
}
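// A CityHash/FarmHash-style mix of a 128-bit value down to a single 64-bit hash.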
STATIC_INLINE uint64_t Hash128to64(uint128_t x) {
const uint64_t kMul = 0x9ddfea08eb382d69ULL;
uint64_t a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;
uint64_t value = 47;
a ^= (a >> value);
uint64_t b = (Uint128High64(x) ^ a) * kMul;
b ^= (b >> value);
b *= kMul;
return b;
}
STATIC_INLINE uint64_t ShiftMix(uint64_t val) {
uint64_t value = 47;
return val ^ (val >> value);
}
STATIC_INLINE uint64_t HashLen16(uint64_t u, uint64_t v, uint64_t mul) {
uint64_t a = (u ^ v) * mul;
uint64_t value = 47;
a ^= (a >> value);
uint64_t b = (v ^ a) * mul;
b ^= (b >> value);
b *= mul;
return b;
}
STATIC_INLINE uint64_t HashLen0to16(const char *s, size_t len) {
if (len > 0) {
uint8_t a = s[0];
uint8_t b = s[len >> 1];
uint8_t c = s[len - 1];
uint32_t y = static_cast<uint32_t>(a) + (static_cast<uint32_t>(b) << 8);
uint32_t z = len + (static_cast<uint32_t>(c) << 2);
return ShiftMix(y * k2 ^ z * k0) * k2;
}
return k2;
}
uint64_t FarmHash64(const char *s, size_t len) { return HashLen0to16(s, len); }
uint64_t Fingerprint64(const string s) { return FarmHash64(s.data(), s.size()); }
template <typename InternalType>
class ColumnInterface {
public:
virtual int64_t FeatureCount(int64_t batch) const = 0;
virtual InternalType Feature(int64_t batch, int64_t n) const = 0;
virtual ~ColumnInterface() {}
};
template <typename InternalType>
class SparseTensorColumn : public ColumnInterface<InternalType> {
public:
SparseTensorColumn(Tensor *values, std::vector<int64_t> feature_counts, std::vector<int64_t> feature_start_indices)
: values_(values),
feature_counts_(std::move(feature_counts)),
feature_start_indices_(std::move(feature_start_indices)) {
if (feature_counts_.size() != feature_start_indices_.size()) {
KERNEL_LOG_ERROR("feature_counts_ is not equal to feature_start_indices_.");
}
}
int64_t FeatureCount(int64_t batch) const override { return feature_counts_[batch]; }
InternalType Feature(int64_t batch, int64_t n) const override;
~SparseTensorColumn() override {}
private:
Tensor *values_;
std::vector<int64_t> feature_counts_;
std::vector<int64_t> feature_start_indices_;
};
template <>
std::string SparseTensorColumn<std::string>::Feature(int64_t batch, int64_t n) const {
const int64_t start = feature_start_indices_[batch];
EigenTensor values_e(values_, values_->GetData());
if (DT_STRING == values_->GetDataType()) return values_e.vec<std::string>().data()[start + n];
return std::to_string(values_e.vec<int64_t>().data()[start + n]);
}
template <>
int64_t SparseTensorColumn<int64_t>::Feature(int64_t batch, int64_t n) const {
const int64_t start = feature_start_indices_[batch];
EigenTensor values_e(values_, values_->GetData());
if (DT_STRING == values_->GetDataType()) {
return Fingerprint64(values_e.vec<std::string>().data()[start + n]);
}
return values_e.vec<int64_t>().data()[start + n];
}
template <typename InternalType>
class DenseTensorColumn : public ColumnInterface<InternalType> {
public:
explicit DenseTensorColumn(Tensor *tensor) : tensor_(tensor) {}
int64_t FeatureCount(int64_t batch) const override { return tensor_->GetTensorShape()->GetDimSize(1); }
InternalType Feature(int64_t batch, int64_t n) const override;
~DenseTensorColumn() override {}
private:
Tensor *tensor_;
};
template <>
int64_t DenseTensorColumn<int64_t>::Feature(int64_t batch, int64_t n) const {
EigenTensor tensor_e(tensor_, tensor_->GetData());
if (DT_STRING == tensor_->GetDataType()) return Fingerprint64(tensor_e.matrix<std::string>()(batch, n));
return tensor_e.matrix<int64_t>()(batch, n);
}
template <>
std::string DenseTensorColumn<std::string>::Feature(int64_t batch, int64_t n) const {
EigenTensor tensor_e(tensor_, tensor_->GetData());
if (DT_STRING == tensor_->GetDataType()) return tensor_e.matrix<std::string>()(batch, n);
return std::to_string(tensor_e.matrix<int64_t>()(batch, n));
}
template <typename OutType>
class OutputUpdater {
public:
OutputUpdater(const std::vector<int64_t> &output_start_indices, Tensor *indices_out, Tensor *values_out)
: output_start_indices_(output_start_indices), indices_out_(indices_out), values_out_(values_out) {}
void Update(const int64_t batch_index, const int64_t cross_count, const OutType &cross) const {
const int64_t output_index = output_start_indices_[batch_index] + cross_count;
auto indices_out_addr = static_cast<int64_t *>(indices_out_->GetData());
int64_t value = 2;
indices_out_addr[output_index * value] = batch_index;
indices_out_addr[output_index * value + 1] = cross_count;
auto values_out_addr = static_cast<OutType *>(values_out_->GetData());
values_out_addr[output_index] = cross;
}
private:
const std::vector<int64_t> &output_start_indices_;
Tensor *indices_out_;
Tensor *values_out_;
};
template <typename InternalType>
class StringCrosser {
public:
StringCrosser(const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns,
const int64_t num_buckets_unused, const uint64_t hash_key_unused)
: columns_(columns) {}
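  // Joins the selected feature from every column with the "_X_" separator to form the crossed key.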
std::string Generate(const int64_t batch_index, const std::vector<int64_t> &permutation) const {
static const auto k_feature_separator = "_X_";
std::vector<InternalType> cross_vec(columns_.size());
for (size_t i = 0; i < permutation.size(); i++) {
cross_vec[i] = columns_[i]->Feature(batch_index, permutation[i]);
}
size_t i;
string str1 = "";
for (i = 0; i < cross_vec.size() - 1; i++) {
str1 = str1 + cross_vec[i].data();
str1 = str1 + k_feature_separator;
}
str1 = str1 + cross_vec[i].data();
return str1;
}
private:
const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns_;
};
class HashCrosser {
public:
HashCrosser(const std::vector<std::unique_ptr<ColumnInterface<int64_t>>> &columns, const int64_t num_buckets,
const uint64_t hash_key)
: columns_(columns), num_buckets_(num_buckets), hash_key_(hash_key) {}
uint64_t ShiftMix(const uint64_t val) const { return val ^ (val >> 47); }
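  // Combines two 64-bit fingerprints with a Murmur-style mix; the result depends on argument order.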
uint64_t FingerprintCat64(const uint64_t fp1, const uint64_t fp2) const {
static const uint64_t kMul = 0xc6a4a7935bd1e995ULL;
uint64_t result = fp1 ^ kMul;
result ^= ShiftMix(fp2 * kMul) * kMul;
result *= kMul;
result = ShiftMix(result) * kMul;
result = ShiftMix(result);
return result;
}
int64_t Generate(const int64_t batch_index, const std::vector<int64_t> &permutation) const {
uint64_t hashed_output = hash_key_;
for (size_t i = 0; i < permutation.size(); ++i) {
uint64_t hash_i = columns_[i]->Feature(batch_index, permutation[i]);
hashed_output = FingerprintCat64(hashed_output, hash_i);
}
if (num_buckets_ > 0) {
return hashed_output % num_buckets_;
} else {
return hashed_output % std::numeric_limits<int64_t>::max();
}
}
private:
const std::vector<std::unique_ptr<ColumnInterface<int64_t>>> &columns_;
const int64_t num_buckets_;
const uint64_t hash_key_;
};
template <typename InternalType>
class ProductIterator {
public:
explicit ProductIterator(const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns,
int64_t batch_index)
: columns_(columns), batch_index_(batch_index) {
next_permutation_.resize(columns_.size(), 0);
has_next_ = true;
for (size_t i = 0; i < columns_.size(); i++) {
if (columns_[i]->FeatureCount(batch_index_) == 0) {
has_next_ = false;
break;
}
}
}
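  // Advances a mixed-radix counter over the per-column feature counts, enumerating the Cartesian
  // product of features for this batch row.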
std::vector<int64_t> Next() {
std::vector<int64_t> permutation(next_permutation_);
bool carry = true;
for (int64_t i = next_permutation_.size() - 1; i >= 0; i--) {
if (carry) {
next_permutation_[i] = next_permutation_[i] + 1;
}
if (next_permutation_[i] == columns_[i]->FeatureCount(batch_index_)) {
next_permutation_[i] = 0;
} else {
carry = false;
break;
}
}
has_next_ = !carry;
return permutation;
}
bool HasNext() { return has_next_; }
private:
bool has_next_;
const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns_;
const int64_t batch_index_;
std::vector<int64_t> next_permutation_;
};
template <bool HASHED_OUTPUT, typename InternalType>
struct CrossTraits;
template <typename InternalType>
struct CrossTraits<false, InternalType> {
typedef StringCrosser<InternalType> Crosser;
typedef OutputUpdater<std::string> Updater;
};
template <>
struct CrossTraits<true, int64_t> {
typedef HashCrosser Crosser;
typedef OutputUpdater<int64_t> Updater;
};
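// The batch size comes from the first sparse dense-shape input when one is present, otherwise
// from the first dimension of the first dense input; with neither kind of input it is 0.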
int64_t CalculateBatchSize(const OpInputList &shapes_list_in, const OpInputList &dense_list_in) {
if (shapes_list_in.size() > 0) {
EigenTensor shapes_list_in_e(shapes_list_in[0], shapes_list_in[0]->GetData());
return shapes_list_in_e.vec<int64_t>()(0);
}
if (dense_list_in.size() > 0) {
return dense_list_in[0]->GetTensorShape()->GetDimSize(0);
}
return 0;
}
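// Scans the COO indices of every sparse column once (they are expected to be grouped by batch
// row) and records, for each batch row, how many features the column holds and where they start.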
void ExtractFeatureData(const OpInputList &indices_list_in, int64_t batch_size,
std::vector<std::vector<int64_t>> *feature_counts,
std::vector<std::vector<int64_t>> *feature_start_indices) {
std::vector<int64_t> current_row(indices_list_in.size());
for (int64_t b = 0; b < batch_size; b++) {
for (int64_t i = 0; i < indices_list_in.size(); i++) {
EigenTensor indices_list_in_e(indices_list_in[i], indices_list_in[i]->GetData());
const auto indices = indices_list_in_e.matrix<int64_t>();
int64_t feature_count = 0;
int64_t start_index = current_row[i];
while (current_row[i] < indices_list_in[i]->GetTensorShape()->GetDimSize(0) && indices(current_row[i], 0) == b) {
feature_count++;
current_row[i]++;
}
(*feature_counts)[i].push_back(feature_count);
(*feature_start_indices)[i].push_back(start_index);
}
}
}
template <typename InternalType>
std::vector<std::unique_ptr<ColumnInterface<InternalType>>> ColumnsFromInput(const OpInputList &indices_list_in,
const OpInputList &values_list_in,
const OpInputList &shapes_list_in,
const OpInputList &dense_list_in) {
std::vector<std::unique_ptr<ColumnInterface<InternalType>>> columns;
const int64_t batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
const int64_t number_of_columns = shapes_list_in.size();
std::vector<std::vector<int64_t>> feature_counts(number_of_columns, std::vector<int64_t>());
std::vector<std::vector<int64_t>> feature_start_indices(number_of_columns, std::vector<int64_t>());
ExtractFeatureData(indices_list_in, batch_size, &feature_counts, &feature_start_indices);
columns.reserve(values_list_in.size());
for (int64_t i = 0; i < values_list_in.size(); ++i) {
columns.emplace_back(
new SparseTensorColumn<InternalType>(values_list_in[i], feature_counts[i], feature_start_indices[i]));
}
for (int64_t i = 0; i < dense_list_in.size(); ++i) {
columns.emplace_back(new DenseTensorColumn<InternalType>(dense_list_in[i]));
}
return columns;
}
template <typename InternalType>
int64_t CrossCountByBatchIndex(const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns,
int64_t batch_index) {
int64_t cross_count = 1;
for (size_t i = 0; i < columns.size(); i++) {
const auto feature_count = columns[i]->FeatureCount(batch_index);
if (feature_count == 0) {
return 0;
}
cross_count *= feature_count;
}
return cross_count;
}
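// Computes the per-batch starting offsets of the generated crosses and shapes the outputs:
// indices is [total_cross_count, 2], values is [total_cross_count], and the dense shape output
// holds {batch_size, max_cross_count}.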
template <typename InternalType>
void CreateOutputTensors(const std::vector<std::unique_ptr<ColumnInterface<InternalType>>> &columns, int64_t batch_size,
CpuKernelContext *context, Tensor *indices_out, Tensor *values_out, Tensor *shape_out,
std::vector<int64_t> *output_start_indices) {
int64_t cross_count_total = 0;
int64_t max_cross_count = 0;
for (int64_t b = 0; b < batch_size; b++) {
(*output_start_indices)[b] = cross_count_total;
const auto cross_count = CrossCountByBatchIndex<InternalType>(columns, b);
max_cross_count = std::max(max_cross_count, cross_count);
cross_count_total += cross_count;
}
indices_out = context->Output(0);
std::vector<int64_t> indices_t;
int64_t value1 = 1;
int64_t value2 = 2;
indices_t.reserve(value2);
indices_t.push_back(cross_count_total);
indices_t.push_back(value2);
indices_out->GetTensorShape()->SetDimSizes(indices_t);
indices_out->SetDataType(DT_INT64);
values_out = context->Output(value1);
std::vector<int64_t> values_t;
values_t.reserve(value1);
values_t.push_back(cross_count_total);
values_out->GetTensorShape()->SetDimSizes(values_t);
shape_out = context->Output(value2);
std::vector<int64_t> shape_t;
shape_t.reserve(value1);
shape_t.push_back(value2);
auto shape_vec = static_cast<int64_t *>(shape_out->GetData());
shape_vec[0] = batch_size;
shape_vec[1] = max_cross_count;
shape_out->GetTensorShape()->SetDimSizes(shape_t);
}
template <bool HASHED_OUTPUT, typename InternalType>
uint32_t SparseCrossCpuKernel::SparseCrossCompute(CpuKernelContext &ctx) {
auto num_buckets_ptr = ctx.GetAttr("num_buckets");
uint32_t inputSize = ctx.GetInputsSize();
int64_t num_buckets_ = 0;
int64_t num = inputSize / 3;
uint64_t hash_key_ = ctx.GetAttr("hash_key")->GetInt();
auto num_ptr = ctx.GetAttr("N");
if (num_ptr != nullptr) {
num = num_ptr->GetInt();
} else {
if (inputSize % 3 == 0) num = num - 1;
}
if (num_buckets_ptr != nullptr) {
num_buckets_ = num_buckets_ptr->GetInt();
}
uint32_t start1 = 0;
uint32_t stop = num;
OpInputList indices_list_in(&ctx, start1, stop);
start1 = start1 + num;
stop = start1 + num;
OpInputList values_list_in(&ctx, start1, stop);
start1 = start1 + num;
stop = start1 + num;
OpInputList shapes_list_in(&ctx, start1, stop);
start1 = start1 + num;
OpInputList dense_list_in(&ctx, start1, inputSize);
const auto size = indices_list_in.size();
int64_t value = 2;
for (int64_t i = 0; i < size; i++) {
if (indices_list_in[i]->GetTensorShape()->GetDimSize(1) != value) {
KERNEL_LOG_ERROR("Expected D2 of index to be 2 got [%d], at position [%d].",
indices_list_in[i]->GetTensorShape()->GetDimSize(1), i);
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < size; i++) {
if (indices_list_in[i]->GetTensorShape()->GetDimSize(0) != values_list_in[i]->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("Expected size of values to be [%d], but got [%d] at position [%d].",
indices_list_in[i]->GetTensorShape()->GetDimSize(0),
values_list_in[i]->GetTensorShape()->GetDimSize(0), i);
return KERNEL_STATUS_PARAM_INVALID;
}
}
const auto batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
for (int64_t i = 0; i < size; i++) {
EigenTensor shapes_list_in_e(shapes_list_in[i], shapes_list_in[i]->GetData());
if (shapes_list_in_e.vec<int64_t>().size() != value) {
KERNEL_LOG_ERROR("Shape input should imply a 2D tensor, but got [%lld] elements at position [%lld].",
static_cast<int64_t>(shapes_list_in_e.vec<int64_t>().size()), i);
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < dense_list_in.size(); ++i) {
if (dense_list_in[i]->GetTensorShape()->GetDimSize(0) != batch_size) {
KERNEL_LOG_ERROR("Expected batch size [%d],got [%d].", batch_size,
dense_list_in[i]->GetTensorShape()->GetDimSize(0));
return KERNEL_STATUS_PARAM_INVALID;
}
}
std::vector<std::unique_ptr<ColumnInterface<InternalType>>> columns =
ColumnsFromInput<InternalType>(indices_list_in, values_list_in, shapes_list_in, dense_list_in);
typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser crosser(columns, num_buckets_, hash_key_);
Tensor *indices_out = ctx.Output(0);
Tensor *values_out = ctx.Output(1);
Tensor *shape_out = ctx.Output(2);
std::vector<int64_t> output_start_indices(batch_size);
CreateOutputTensors(columns, batch_size, &ctx, indices_out, values_out, shape_out, &output_start_indices);
typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater updater(output_start_indices, indices_out, values_out);
for (int64_t b = 0; b < batch_size; b++) {
ProductIterator<InternalType> product_iterator(columns, b);
int64_t cross_count = 0;
while (product_iterator.HasNext()) {
const auto permutation = product_iterator.Next();
updater.Update(b, cross_count, crosser.Generate(b, permutation));
cross_count++;
}
}
return KERNEL_STATUS_OK;
}
uint32_t SparseCrossCpuKernel::Compute(CpuKernelContext &ctx) {
bool hash_out = ctx.GetAttr("hashed_output")->GetBool();
DataType intertype = ctx.GetAttr("internal_type")->GetDataType();
if (!hash_out) {
if (intertype == 0) {
uint32_t res = SparseCrossCompute<false, string>(ctx);
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else {
uint32_t res = SparseCrossCompute<true, int64_t>(ctx);
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseCross, SparseCrossCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,121 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSECROSS_H_
#define AICPU_KERNELS_NORMALIZED_SPARSECROSS_H_
#include <algorithm>
#include <numeric>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace swap {
#define STATIC_INLINE static inline
#define BSWAP_8(x) ((x)&0xff)
#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
#define uint32_in_expected_order(x) (x)
#define uint64_in_expected_order(x) (BSWAP_64(x))
} // namespace swap
namespace aicpu {
class SparseCrossCpuKernel : public CpuKernel {
public:
SparseCrossCpuKernel() = default;
~SparseCrossCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <bool HASHED_OUTPUT, typename InternalType>
uint32_t SparseCrossCompute(CpuKernelContext &ctx);
int64_t num_buckets_;
uint64_t hash_key_;
};
template <typename ListType, typename ElementType>
class OpArgIterator {
public:
using iterator_category = std::forward_iterator_tag;
using value_type = ElementType;
using pointer = ElementType *;
using const_pointer = const ElementType *;
using reference = ElementType &;
using const_reference = const ElementType &;
using difference_type = ptrdiff_t;
OpArgIterator(const ListType *list, int i) : list_(list), i_(i) {}
bool operator==(const OpArgIterator &rhs) {
if (list_ == rhs.list_) {
return i_ == rhs.i_;
}
return false;
}
bool operator!=(const OpArgIterator &rhs) {
if (list_ == rhs.list_) {
return i_ != rhs.i_;
}
return true;
}
OpArgIterator operator++() { // prefix ++it
++i_;
return *this;
}
OpArgIterator operator++(int) { // postfix it++
OpArgIterator old_value = *this;
++i_;
return old_value;
}
reference operator*() { return (*list_)[i_]; }
pointer operator->() { return &(*list_)[i_]; }
const_reference operator*() const { return (*list_)[i_]; }
const_pointer operator->() const { return &(*list_)[i_]; }
private:
const ListType *const list_;
int i_;
};
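// Lightweight view over the contiguous input range [start, stop) of a kernel context, giving
// indexed access and iteration over the tensors without copying them.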
class OpInputList {
public:
using Iterator = OpArgIterator<OpInputList, const Tensor>;
OpInputList() : ctx_(nullptr), start_(0), stop_(0) {}
OpInputList(CpuKernelContext *ctx, uint32_t start, uint32_t stop) : ctx_(ctx), start_(start), stop_(stop) {}
OpInputList &operator=(const OpInputList &other) = default;
OpInputList(const OpInputList &other) = default;
Tensor *operator[](uint32_t i) const { return ctx_->Input(start_ + i); }
uint32_t size() const { return stop_ - start_; }
Iterator begin() const { return Iterator(this, 0); }
Iterator end() const { return Iterator(this, size()); }
private:
CpuKernelContext *ctx_; // not owned
uint32_t start_;
uint32_t stop_;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,180 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_segment_mean_with_num_segments.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *SparseSegmentMeanWithNumSegments = "SparseSegmentMeanWithNumSegments";
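// Dispatches ComputeKernel<x_type, indices_type, segment_ids_type, num_segments_type>, where each
// of the three index-like inputs may independently be int32 or int64.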
#define COMPUTE_CASE(DTYPE, TYPE, DTYPE_1, DTYPE_2, DTYPE_3, CTX) \
case (DTYPE): \
if ((DTYPE_1) == DT_INT32) { \
if ((DTYPE_2) == DT_INT32 && (DTYPE_3) == DT_INT32) { \
return ComputeKernel<TYPE, int32_t, int32_t, int32_t>(CTX); \
} else if ((DTYPE_2) == DT_INT32 && (DTYPE_3) != DT_INT32) { \
return ComputeKernel<TYPE, int32_t, int32_t, int64_t>(CTX); \
} else if ((DTYPE_2) != DT_INT32 && (DTYPE_3) == DT_INT32) { \
return ComputeKernel<TYPE, int32_t, int64_t, int32_t>(CTX); \
} else if ((DTYPE_2) != DT_INT32 && (DTYPE_3) != DT_INT32) { \
return ComputeKernel<TYPE, int32_t, int64_t, int64_t>(CTX); \
} \
} else { \
if ((DTYPE_2) == DT_INT32 && (DTYPE_3) == DT_INT32) { \
return ComputeKernel<TYPE, int64_t, int32_t, int32_t>(CTX); \
} else if ((DTYPE_2) == DT_INT32 && (DTYPE_3) != DT_INT32) { \
return ComputeKernel<TYPE, int64_t, int32_t, int64_t>(CTX); \
} else if ((DTYPE_2) != DT_INT32 && (DTYPE_3) == DT_INT32) { \
return ComputeKernel<TYPE, int64_t, int64_t, int32_t>(CTX); \
} else if ((DTYPE_2) != DT_INT32 && (DTYPE_3) != DT_INT32) { \
return ComputeKernel<TYPE, int64_t, int64_t, int64_t>(CTX); \
} \
} \
break;
} // namespace
namespace aicpu {
uint32_t SparseSegmentMeanWithNumSegmentsCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentMeanWithNumSegments normalcheck failed.");
Tensor *x = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *segment_ids = ctx.Input(2);
Tensor *num_segments = ctx.Input(3);
if (x->GetDataSize() == 0 || indices->GetDataSize() == 0 || segment_ids->GetDataSize() == 0 ||
num_segments->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_shape = x->GetTensorShape();
auto indices_shape = indices->GetTensorShape();
auto segment_ids_shape = segment_ids->GetTensorShape();
auto num_segments_shape = num_segments->GetTensorShape();
if (x_shape->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor x's rank is less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (indices_shape->NumElements() != segment_ids_shape->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensors indices and segment_ids have different numbers of elements.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_data_type = x->GetDataType();
auto indices_data_type = indices->GetDataType();
auto segment_ids_data_type = segment_ids->GetDataType();
auto num_segments_data_type = num_segments->GetDataType();
if (indices_data_type != DT_INT32 && indices_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentMeanWithNumSegments kernel data type [%s] not support.",
DTypeStr(indices_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_type != DT_INT32 && segment_ids_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentMeanWithNumSegments kernel data type [%s] not support.",
DTypeStr(segment_ids_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (num_segments_data_type != DT_INT32 && num_segments_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentMeanWithNumSegments kernel data type [%s] not support.",
DTypeStr(num_segments_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (x_data_type) {
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
default:
KERNEL_LOG_ERROR("SparseSegmentMeanWithNumSegments kernel data type [%s] not support.",
DTypeStr(x_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SparseSegmentMeanWithNumSegments, SparseSegmentMeanWithNumSegmentsCpuKernel);
template <typename T1, typename T2, typename T3, typename T4>
uint32_t SparseSegmentMeanWithNumSegmentsCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
int64_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
int64_t m = ctx.Input(2)->GetTensorShape()->NumElements();
auto x_ptr = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
auto indices_ptr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto segment_ids_ptr = reinterpret_cast<T3 *>(ctx.Input(2)->GetData());
auto num_segments_ptr = reinterpret_cast<T4 *>(ctx.Input(3)->GetData());
auto y_ptr = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
y_shape_values[0] = num_segments_ptr[0];
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
for (int64_t i = 1; i < m; i++) {
if (segment_ids_ptr[i] < segment_ids_ptr[i - 1]) {
KERNEL_LOG_ERROR("segment_ids should be sorted.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < m; i++) {
if (indices_ptr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("indices out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_ptr[i] >= num_segments_ptr[0]) {
KERNEL_LOG_ERROR("segment_ids out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < ctx.Output(0)->GetTensorShape()->NumElements(); i++) {
y_ptr[i] = (T1)0;
}
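// Accumulate the rows of x selected by indices into their segment's output row; whenever the
// segment id changes, divide the finished row by its element count so each output row ends up
// as the mean of its members.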
int oldindex = -1;
int countnum = 0;
for (int64_t i = 0; i < m; i++) {
if (oldindex == segment_ids_ptr[i]) {
countnum++;
} else if (countnum != 0) {
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] /= static_cast<T1>(countnum);
}
countnum = 1;
oldindex = segment_ids_ptr[i];
} else {
countnum = 1;
oldindex = segment_ids_ptr[i];
}
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] += x_ptr[j + indices_ptr[i] * n];
}
}
if (countnum != 0) {
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] /= static_cast<T1>(countnum);
}
}
return KERNEL_STATUS_OK;
}
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_MEAN_WITH_NUM_SEGMENTS_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_MEAN_WITH_NUM_SEGMENTS_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class SparseSegmentMeanWithNumSegmentsCpuKernel : public CpuKernel {
public:
SparseSegmentMeanWithNumSegmentsCpuKernel() = default;
~SparseSegmentMeanWithNumSegmentsCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2, typename T3, typename T4>
static uint32_t ComputeKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,282 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_slice.h"
#include <unistd.h>
#include <complex>
#include <string>
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "utils/sparse_tensor.h"
namespace {
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 5;
const char *kSparseSlice = "SparseSlice";
} // namespace
namespace aicpu {
template <typename T>
using ArraySlice = std::vector<T>;
template <typename T>
void Slice(Tensor *output_indices, Tensor *output_values, Tensor *output_dense_shape, SparseTensor *input_tensor,
const aicpu::ArraySlice<int64_t> &start, const aicpu::ArraySlice<int64_t> &size) {
auto output_shape = CpuKernelUtils::CreateTensorShape();
output_shape->SetDimSizes(input_tensor->shape());
const int dims = input_tensor->dims();
std::vector<int64_t> dimsVec(dims, 0);
for (int dim = 0; dim < dims; dim++) {
// Determine the size of the result; if the selected slice goes beyond the
// input boundary, the result will correspond to the size of the overlap
// between the input and the selected slice.
const auto input_size = output_shape->GetDimSize(dim);
const int64_t start_index = start[dim];
const int64_t slice_size = size[dim];
if (start_index + slice_size < input_size) {
dimsVec[dim] = slice_size;
} else if (start_index < input_size) {
dimsVec[dim] = input_size - start_index;
} else {
dimsVec[dim] = 0;
}
}
output_shape->SetDimSizes(dimsVec);
auto input_indices_t = input_tensor->indices().get()->matrix<int64_t>();
auto input_values_t = input_tensor->values().get()->vec<T>();
// Find the number of indices that fall inside start and size.
int count = 0;
int dim_size = input_tensor->indices()->GetTensor()->GetTensorShape()->GetDimSize(0);
for (int i = 0; i < dim_size; i++) {
// The following will check to see if an input is within the
// range specified by start and size.
// The for loop below iterates through all dimensions. In case
// the index falls outside of the start and size at any dimension,
// it will be considered as a "no hit" (hit = false). In this
// case, it will not be counted as the index that fall inside
// the range specified by start and size.
bool hit = true;
for (int dim = 0; dim < dims; dim++) {
if (!(start[dim] <= input_indices_t(i, dim) && input_indices_t(i, dim) < start[dim] + size[dim])) {
hit = false;
break;
}
}
if (!hit) {
continue;
}
count++;
}
auto eigen_tensor_indices = EigenTensor(output_indices, output_indices->GetData());
auto eigen_tensor_values = EigenTensor(output_values, output_values->GetData());
auto eigen_tensor_shape = EigenTensor(output_dense_shape, output_dense_shape->GetData());
auto output_values_t = eigen_tensor_values.vec<T>();
auto output_indices_t = eigen_tensor_indices.matrix<int64_t>();
auto output_shape_t = eigen_tensor_shape.vec<int64_t>();
// Obtain the output indices that fall inside start and size.
for (int dim = 0; dim < output_dense_shape->NumElements(); ++dim) {
const auto input_size = output_shape->GetDimSize(dim);
output_shape_t(dim) = input_size;
}
int index = 0;
for (int i = 0; i < dim_size && index < count; i++) {
// The logic here is similar as the above except that the above
// only count the number of indices while here we actually generate
// the output.
bool hit = true;
for (int dim = 0; dim < dims; dim++) {
if (!(start[dim] <= input_indices_t(i, dim) && input_indices_t(i, dim) < start[dim] + size[dim])) {
hit = false;
break;
}
}
if (!hit) {
continue;
}
output_values_t(index) = input_values_t(i);
for (int64_t dim = 0; dim < dims; dim++) {
output_indices_t(index, dim) = input_indices_t(i, dim) - start[dim];
}
index++;
}
const int num_dims = dims;
const int64_t y_nnz = index;
std::vector<int64_t> indices_dims = {y_nnz, num_dims};
auto output_indices_shape = output_indices->GetTensorShape();
output_indices_shape->SetDimSizes(indices_dims);
output_indices->SetTensorShape(output_indices_shape.get());
std::vector<int64_t> values_dims = {y_nnz};
auto output_values_shape = output_values->GetTensorShape();
output_values_shape->SetDimSizes(values_dims);
output_values->SetTensorShape(output_values_shape.get());
}
std::uint32_t SparseSliceCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *indices = ctx.Input(0);
Tensor *values = ctx.Input(1);
Tensor *shape = ctx.Input(2);
Tensor *start = ctx.Input(3);
Tensor *size = ctx.Input(4);
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "sparseslice check input and output number failed.");
KERNEL_HANDLE_ERROR(SparseSliceParamCheck(indices, values, shape, start, size), "sparseslice check params failed.");
const int input_dims = shape->NumElements();
auto shape_shape = shape->GetTensorShape();
std::vector<int64_t> dense_shape;
std::vector<int64_t> order;
int64_t output_size = 1;
for (int32_t index = 0; index < shape_shape->GetDimSize(0); ++index) {
if (shape->GetDataType() == DT_INT32) {
int32_t *temp_dim = static_cast<int32_t *>(shape->GetData());
dense_shape.emplace_back(static_cast<int64_t>(temp_dim[index]));
} else {
int64_t *temp_dim = static_cast<int64_t *>(shape->GetData());
dense_shape.emplace_back(temp_dim[index]);
}
order.push_back(dense_shape[index]);
output_size *= dense_shape[index];
}
std::iota(order.begin(), order.end(), 0);
SparseTensor st;
if (st.CreateSparseTensor(indices, values, dense_shape, order) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Create sparse tensor failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
aicpu::ArraySlice<int64_t> slice_start(input_dims, 0);
aicpu::ArraySlice<int64_t> slice_size(input_dims, 0);
auto start_val = static_cast<int64_t *>(start->GetData());
auto size_val = static_cast<int64_t *>(size->GetData());
for (int64_t i = 0; i < input_dims; i++) {
slice_start[i] = *(start_val + i);
}
for (int64_t i = 0; i < input_dims; i++) {
slice_size[i] = *(size_val + i);
}
Tensor *output_indices = ctx.Output(0);
Tensor *output_values = ctx.Output(1);
Tensor *output_dense_shape = ctx.Output(2);
DataType values_data_type = ctx.Input(1)->GetDataType();
KERNEL_LOG_DEBUG("%s op input[a] data type is [%s].", kSparseSlice, DTypeStr(values_data_type).c_str());
switch (values_data_type) {
case DT_INT64:
Slice<int64_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_INT32:
Slice<int32_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_UINT16:
Slice<uint16_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_INT16:
Slice<int16_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_UINT8:
Slice<uint8_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_INT8:
Slice<int8_t>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_FLOAT16:
Slice<Eigen::half>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_FLOAT:
Slice<float>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_DOUBLE:
Slice<double>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_COMPLEX64:
Slice<std::complex<float>>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_COMPLEX128:
Slice<std::complex<double>>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_BOOL:
Slice<bool>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
case DT_STRING:
Slice<std::string>(output_indices, output_values, output_dense_shape, &st, slice_start, slice_size);
break;
default:
KERNEL_LOG_ERROR("SparseSlice kernel data type [%s] not support.", DTypeStr(values_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SparseSliceCpuKernel::SparseSliceParamCheck(Tensor *indices, Tensor *values, Tensor *shape, Tensor *start,
Tensor *size) {
auto indices_shape = indices->GetTensorShape();
KERNEL_CHECK_FALSE((IsMatrix(indices_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input indices must be a matrix.");
auto values_shape = values->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(values_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input values must be a vector.");
auto shape_shape = shape->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(shape_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input shape must be a vector.");
auto start_shape = start->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(start_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input start must be a vector.");
auto size_shape = size->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(size_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID, "Input size must be a vector.");
const int input_dims = shape->NumElements();
KERNEL_CHECK_FALSE((input_dims == start->NumElements()), KERNEL_STATUS_PARAM_INVALID,
"Expected start to be a vector of length [%d], but got length [%lld].", input_dims,
start->NumElements());
KERNEL_CHECK_FALSE((input_dims == size->NumElements()), KERNEL_STATUS_PARAM_INVALID,
"Expected size to be a vector of length [%d], but got length [%lld].", input_dims,
size->NumElements());
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseSlice, SparseSliceCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,24 @@
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SLICE_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SLICE_H_
#include <string>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "cpu_tensor.h"
#include "cpu_tensor_shape.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseSliceCpuKernel : public CpuKernel {
public:
SparseSliceCpuKernel() = default;
~SparseSliceCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
uint32_t SparseSliceParamCheck(Tensor *indices, Tensor *values, Tensor *shape, Tensor *start, Tensor *size);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,145 @@
#include "sparse_slice_grad.h"
#include <complex>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "utils/sparse_tensor.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *kSparseSliceGrad = "SparseSliceGrad";
} // namespace
namespace aicpu {
uint32_t SparseSliceGradCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *backprop_val_grad = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *start = ctx.Input(2);
Tensor *new_indices = ctx.Input(3);
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "sparseslicegrad check input and output number failed.");
KERNEL_HANDLE_ERROR(SparseSliceGradParamCheck(backprop_val_grad, indices, start, new_indices),
"sparseslicegrad check params failed.");
DataType input0_type = ctx.Input(0)->GetDataType();
KERNEL_LOG_DEBUG("%s op input[a] data type is [%s].", kSparseSliceGrad, DTypeStr(input0_type).c_str());
switch (input0_type) {
case DT_INT8:
GradCompute<int8_t>(ctx);
break;
case DT_UINT8:
GradCompute<uint8_t>(ctx);
break;
case DT_INT16:
GradCompute<int16_t>(ctx);
break;
case DT_UINT16:
GradCompute<uint16_t>(ctx);
break;
case DT_INT32:
GradCompute<int32_t>(ctx);
break;
case DT_INT64:
GradCompute<int64_t>(ctx);
break;
case DT_FLOAT:
GradCompute<float>(ctx);
break;
case DT_FLOAT16:
GradCompute<Eigen::half>(ctx);
break;
case DT_DOUBLE:
GradCompute<double>(ctx);
break;
case DT_COMPLEX64:
GradCompute<std::complex<float>>(ctx);
break;
case DT_COMPLEX128:
GradCompute<std::complex<double>>(ctx);
break;
default:
KERNEL_LOG_ERROR("SparseSliceGrad kernel data type [%s] not support.", DTypeStr(input0_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SparseSliceGradCpuKernel::GradCompute(CpuKernelContext &ctx) {
Tensor *backprop_val_grad = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *start = ctx.Input(2);
Tensor *new_indices = ctx.Input(3);
auto indices_shape = indices->GetTensorShape();
const int64_t input_nnz = indices_shape->GetDimSize(0);
Tensor *y_grad = ctx.Output(0);
auto *y_grad_vec = static_cast<T *>(y_grad->GetData());
memset(y_grad_vec, 0, sizeof(T) * input_nnz);
auto *backprop_val_grad_vec = static_cast<T *>(backprop_val_grad->GetData());
const auto indices_mat = (EigenTensor(indices, indices->GetData())).matrix<int64_t>();
const auto new_indices_mat = (EigenTensor(new_indices, new_indices->GetData())).matrix<int64_t>();
EigenTensor start_ET(start, start->GetData());
const auto start_flat = start_ET.flat<int64_t>();
int64_t j = 0;
const int num_dims = indices_shape->GetDimSize(1);
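// Both index lists are ordered, so a single forward pass suffices: advance j through new_indices
// and copy its gradient whenever the current input index equals new_indices[j] + start; input
// positions that were sliced away keep their zero gradient.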
for (int64_t i = 0; i < input_nnz && j < backprop_val_grad->NumElements(); ++i) {
bool is_same = true;
for (int d = 0; d < num_dims; ++d) {
const int64_t indices_value = indices_mat(i, d);
const int64_t new_indices_value = new_indices_mat(j, d);
const int64_t offset = start_flat(d);
if (indices_value != new_indices_value + offset) {
is_same = false;
break;
}
}
if (is_same) {
y_grad_vec[i] = *(backprop_val_grad_vec + j);
++j;
}
}
KERNEL_CHECK_FALSE((backprop_val_grad->NumElements() == j), KERNEL_STATUS_PARAM_INVALID,
"Elements of backprop_val_grad aren't all propagated. "
"Num elements: [%lld], used: [%lld].",
backprop_val_grad->NumElements(), j);
return KERNEL_STATUS_OK;
}
uint32_t SparseSliceGradCpuKernel::SparseSliceGradParamCheck(Tensor *backprop_val_grad, Tensor *indices, Tensor *start,
Tensor *new_indices) {
KERNEL_CHECK_FALSE((IsVector(backprop_val_grad->GetTensorShape()->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input backprop_val_grad should be a vector but received shape: [%s].",
VectorToString(backprop_val_grad->GetTensorShape()->GetDimSizes()).c_str());
KERNEL_CHECK_FALSE(
(IsMatrix(indices->GetTensorShape()->GetDimSizes()) && IsMatrix(new_indices->GetTensorShape()->GetDimSizes())),
KERNEL_STATUS_PARAM_INVALID,
"Input and output indices should be matrices, but "
"received shapes: [%s] and [%s].",
VectorToString(indices->GetTensorShape()->GetDimSizes()).c_str(),
VectorToString(new_indices->GetTensorShape()->GetDimSizes()).c_str());
auto indices_shape = indices->GetTensorShape();
auto new_indices_shape = new_indices->GetTensorShape();
KERNEL_CHECK_FALSE((indices_shape->GetDimSize(1) == new_indices_shape->GetDimSize(1)), KERNEL_STATUS_PARAM_INVALID,
"The input and output indices should have the same ndims, got: [%lld] and [%lld].",
indices_shape->GetDimSize(1), new_indices_shape->GetDimSize(1));
KERNEL_CHECK_FALSE((new_indices_shape->GetDimSize(0) <= indices_shape->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
"The number of rows of output_indices should not be greater than that of input_indices, "
"got: [%lld] and [%lld].",
new_indices_shape->GetDimSize(0), indices_shape->GetDimSize(0));
KERNEL_CHECK_FALSE((backprop_val_grad->NumElements() == new_indices_shape->GetDimSize(0)),
KERNEL_STATUS_PARAM_INVALID,
"The number of elements of backprop_val_grad and the number of rows of new_indices should match "
"(#nnz of sum), got: [%lld] and [%lld].",
backprop_val_grad->NumElements(), new_indices_shape->GetDimSize(0));
KERNEL_CHECK_FALSE((IsVector(start->GetTensorShape()->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"The start should be a vector but received shape [%s].",
VectorToString(start->GetTensorShape()->GetDimSizes()).c_str());
const int num_dims = indices_shape->GetDimSize(1);
KERNEL_CHECK_FALSE((num_dims == start->NumElements()), KERNEL_STATUS_PARAM_INVALID,
"Expected start must be a vector of length [%d] but got length [%d].", num_dims,
start->NumElements());
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseSliceGrad, SparseSliceGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SLICE_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SLICE_GRAD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseSliceGradCpuKernel : public CpuKernel {
public:
SparseSliceGradCpuKernel() = default;
~SparseSliceGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
uint32_t SparseSliceGradParamCheck(Tensor *backprop_val_grad, Tensor *indices, Tensor *start, Tensor *new_indices);
template <typename T>
uint32_t GradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,155 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_softmax.h"
#include <securec.h>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "utils/sparse_tensor.h"
namespace {
const uint32_t kSparseSoftmaxInputNum = 3;
const uint32_t kSparseSoftmaxOutputNum = 1;
const uint32_t kIndex0 = 0;
const uint32_t kIndex1 = 1;
const uint32_t kIndex2 = 2;
const uint32_t kSize1 = 1;
const uint32_t kSize2 = 2;
const char *kSparseSoftmax = "SparseSoftmax";
#define SPARSESOFTMAX_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SparseSoftmaxCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SparseSoft kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SparseSoftmaxCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSparseSoftmaxInputNum, kSparseSoftmaxOutputNum),
"[%s] check input and output failed.", kSparseSoftmax);
// parse params
KERNEL_HANDLE_ERROR(SparseSoftmaxCheck(ctx), "[%s] check params failed.", kSparseSoftmax);
auto data_type = ctx.Input(1)->GetDataType();
switch (data_type) {
SPARSESOFTMAX_COMPUTE_CASE(DT_FLOAT, float, ctx)
SPARSESOFTMAX_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("SparseSoftmax kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SparseSoftmaxCpuKernel::SparseSoftmaxCheck(CpuKernelContext &ctx) {
std::vector<int64_t> shape_indices = ctx.Input(kIndex0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_values = ctx.Input(kIndex1)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_shape = ctx.Input(kIndex2)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_output = ctx.Output(kIndex0)->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE((shape_indices.size() == kSize2), KERNEL_STATUS_PARAM_INVALID,
"Indices must be rank 2, got rank [%zu].", shape_indices.size())
KERNEL_CHECK_FALSE((shape_values.size() == kSize1), KERNEL_STATUS_PARAM_INVALID, "Values must be rank 1, got rank [%zu].",
shape_values.size())
KERNEL_CHECK_FALSE((shape_shape.size() == kSize1), KERNEL_STATUS_PARAM_INVALID, "Shape must be rank 1, got rank [%zu].",
shape_shape.size())
KERNEL_CHECK_FALSE((ctx.Input(kIndex2)->GetTensorShape()->NumElements() >= kSize2), KERNEL_STATUS_PARAM_INVALID,
"Shape must have at least 2 elements, got [%lld].", ctx.Input(kIndex2)->GetTensorShape()->NumElements())
KERNEL_CHECK_FALSE((shape_values.size() == shape_output.size()), KERNEL_STATUS_PARAM_INVALID,
"The input shape size should be same as the output shape size")
const int64_t nnz = shape_indices[0];
const int64_t data_num = ctx.Input(kIndex1)->NumElements();
KERNEL_CHECK_FALSE((nnz == data_num), KERNEL_STATUS_PARAM_INVALID,
"The values number should be same as the indices_size(0)");
auto data_type_indices = ctx.Input(kIndex0)->GetDataType();
auto data_type_shape = ctx.Input(kIndex2)->GetDataType();
KERNEL_CHECK_FALSE((data_type_indices == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"data type of indices should be int64");
KERNEL_CHECK_FALSE((data_type_shape == DT_INT64), KERNEL_STATUS_PARAM_INVALID, "data type of shape should be int64");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SparseSoftmaxCpuKernel::SparseSoftmaxCompute(CpuKernelContext &ctx) {
int64_t data_num = ctx.Input(kIndex1)->NumElements();
auto *indices_t = ctx.Input(kIndex0);
auto *values_t = ctx.Input(kIndex1);
auto *shape_t = ctx.Input(kIndex2);
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> shape_indices = ctx.Input(0)->GetTensorShape()->GetDimSizes();
const int64_t nnz = shape_indices[0];
const int64_t rank = static_cast<int64_t>(shape_indices[1]);
SparseTensor st;
std::vector<int64_t> order;
std::vector<int64_t> shape_flat;
int64_t *temp_dim = reinterpret_cast<int64_t *>(shape_t->GetData());
for (int32_t index = 0; index < shape_t->GetTensorShape()->GetDimSize(0); ++index) {
shape_flat.emplace_back(temp_dim[index]);
order.push_back(shape_flat[index]);
}
std::iota(order.begin(), order.end(), 0);
if (st.CreateSparseTensor(indices_t, values_t, shape_flat, order) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Create sparse tensor failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
Eigen::Tensor<T, 1, Eigen::RowMajor> output_flat(nnz);
// { 0, ..., rank-1 }.
std::vector<int64_t> kReorderDims(rank);
std::iota(kReorderDims.begin(), kReorderDims.end(), 0);
// All but the last dim -- the class dimension to be max-reduced along.
std::vector<int64_t> kGroupByDims(rank - 1);
std::iota(kGroupByDims.begin(), kGroupByDims.end(), 0);
st.Reorder<T>(kReorderDims);
int64_t count = 0;
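// For every group sharing the leading rank-1 index coordinates, apply a numerically stable
// softmax over its values: subtract the group's max, exponentiate, and normalize by the sum.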
for (const auto &g : st.group(kGroupByDims)) {
const auto group_vals = g.values<T>();
const int group_size = group_vals.size();
Eigen::Tensor<T, 0, Eigen::RowMajor> tmp_scalar;
tmp_scalar = group_vals.maximum();
Eigen::Tensor<T, 1, Eigen::RowMajor> tmp(group_size);
tmp = (group_vals - tmp.constant(tmp_scalar())).exp();
tmp_scalar = tmp.sum().inverse();
tmp = tmp * tmp.constant(tmp_scalar());
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>> output_part(output_flat.data() + count, group_size);
output_part = tmp;
count += group_size;
}
for (int64_t index = 0; index < data_num; ++index) {
output_data[index] = static_cast<T>(output_flat[index]);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseSoftmax, SparseSoftmaxCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSESOFTMAX_H_
#define AICPU_KERNELS_NORMALIZED_SPARSESOFTMAX_H_
#include "cpu_ops_kernel.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseSoftmaxCpuKernel : public CpuKernel {
public:
SparseSoftmaxCpuKernel() = default;
~SparseSoftmaxCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t SparseSoftmaxCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseSoftmaxCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,226 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_tensor_dense_add.h"
#include <float.h>
#include <securec.h>
#include <complex>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "iostream"
#include "kernel_log.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kSparseTensorDenseAdd = "SparseTensorDenseAdd";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 4;
// when input data size is more than kParallelDataNum, use Parallel func
constexpr uint64_t kParallelDataNums = 256 * 1024;
} // namespace
namespace aicpu {
uint32_t SparseTensorDenseAddCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Check SparseTensorDenseAdd params failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
uint32_t res = ValidateInputs(ctx);
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
DataType data_type = static_cast<DataType>(ctx.Input(1)->GetDataType());
switch (data_type) {
case DT_INT8:
return DoCompute<int8_t>(ctx);
case DT_UINT8:
return DoCompute<uint8_t>(ctx);
case DT_INT16:
return DoCompute<int16_t>(ctx);
case DT_UINT16:
return DoCompute<uint16_t>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
case DT_FLOAT16:
return DoCompute<Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_COMPLEX64:
return DoCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return DoCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename T>
uint32_t SparseTensorDenseAddCpuKernel::DoCompute(CpuKernelContext &ctx) {
Tensor *a_indices = ctx.Input(0);
Tensor *a_values = ctx.Input(1);
Tensor *b = ctx.Input(3);
Tensor *out = ctx.Output(0);
const int NDIMS = static_cast<int>(a_indices->GetTensorShape()->GetDimSize(1));
auto b_data = reinterpret_cast<T *>(b->GetData());
auto out_data = reinterpret_cast<T *>(out->GetData());
auto value_data = reinterpret_cast<T *>(a_values->GetData());
const auto ix_ = std::make_shared<EigenTensor>(a_indices, a_indices->GetData());
DataType dt = static_cast<DataType>(a_indices->GetDataType());
uint32_t data_num = out->NumElements();
if (data_num <= kParallelDataNums) {
for (size_t i = 0; i < data_num; i++) {
out_data[i] = b_data[i];
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_sparsetensordenseadd = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out_data[i] = b_data[i];
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_sparsetensordenseadd);
}
const int num_nnz = static_cast<int>(a_indices->GetTensorShape()->GetDimSize(0));
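// Row-major strides of the dense input b, used to flatten each sparse (multi-dimensional) index
// into a linear offset into out_data.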
std::vector<int64_t> strides(NDIMS);
if (NDIMS > 0) {
strides[NDIMS - 1] = 1;
}
for (int d = NDIMS - 2; d >= 0; --d) {
const int64_t dimsize = b->GetTensorShape()->GetDimSize(d + 1);
strides[d] = strides[d + 1] * dimsize;
}
for (int i = 0; i < num_nnz; ++i) {
int64_t ix = 0;
for (int d = 0; d < NDIMS; ++d) {
int64_t ix_i_d = 0;
if (dt == DT_INT32) {
auto a_indices_mat = ix_->matrix<int32_t>();
ix_i_d = a_indices_mat(i, d);
} else {
auto a_indices_mat = ix_->matrix<int64_t>();
ix_i_d = a_indices_mat(i, d);
}
ix += strides[d] * ix_i_d;
}
out_data[ix] += value_data[i];
}
return KERNEL_STATUS_OK;
}
uint32_t SparseTensorDenseAddCpuKernel::ValidateInputs(CpuKernelContext &ctx) {
Tensor *a_indices_t = ctx.Input(0);
Tensor *a_values_t = ctx.Input(1);
Tensor *a_shape_t = ctx.Input(2);
Tensor *b_t = ctx.Input(3);
Tensor *out_t = ctx.Output(0);
const int a_indices_shape_dims = 2;
DataType input0_dt = a_values_t->GetDataType();
DataType input1_dt = b_t->GetDataType();
DataType input2_dt = out_t->GetDataType();
const int ndims = static_cast<int>(a_indices_t->GetTensorShape()->GetDimSize(1));
const int min_ndims = 1;
const int max_ndims = 5;
if (ndims < min_ndims || ndims > max_ndims) {
KERNEL_LOG_ERROR("Only tensors with ranks between 1 and 5 are currently supported. Tensor rank: [%d]", ndims);
return KERNEL_STATUS_PARAM_INVALID;
}
// valid data type
if (input0_dt != input1_dt || input1_dt != input2_dt) {
KERNEL_LOG_ERROR("x1_values data type[%s], x2 data type[%s] and y data type[%s] must be same.",
DTypeStr(input0_dt).c_str(), DTypeStr(input1_dt).c_str(), DTypeStr(input2_dt).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
int32_t IndiceType = a_indices_t->GetDataType();
int32_t ShapeType = a_shape_t->GetDataType();
bool invalid_indice_type = (IndiceType != DT_INT64 && IndiceType != DT_INT32);
bool invalid_shape_type = (ShapeType != DT_INT64 && ShapeType != DT_INT32);
if (invalid_shape_type || invalid_indice_type) {
KERNEL_LOG_ERROR(
"Indices or shape data type is invalid; it must be int32 or int64, indiceType [%d], shapeType "
"[%d].",
IndiceType, ShapeType);
return KERNEL_STATUS_PARAM_INVALID;
}
if (IndiceType != ShapeType) {
KERNEL_LOG_ERROR(
"Indice type and shape type should be same, indiceType [%d], shapeType "
"[%d].",
IndiceType, ShapeType);
return KERNEL_STATUS_PARAM_INVALID;
}
// valid data shape
if (a_indices_t->GetTensorShape()->GetDims() != a_indices_shape_dims) {
KERNEL_LOG_ERROR("Input a_indices should be a matrix but get dim size: [%d].",
a_indices_t->GetTensorShape()->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (a_values_t->GetTensorShape()->GetDims() != 1 || a_shape_t->GetTensorShape()->GetDims() != 1) {
KERNEL_LOG_ERROR(
"Inputs a_values and a_shape should be vectors but received shapes: "
"[%d] and [%d]",
a_values_t->GetTensorShape()->GetDims(), a_shape_t->GetTensorShape()->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (a_shape_t->NumElements() != b_t->GetTensorShape()->GetDims() ||
out_t->GetTensorShape()->GetDims() != b_t->GetTensorShape()->GetDims()) {
KERNEL_LOG_ERROR(
"Three operands have different ranks; received: [%lld] , [%lld] and "
"[%lld]",
a_shape_t->NumElements(), b_t->GetTensorShape()->GetDims(), out_t->GetTensorShape()->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
std::shared_ptr<EigenTensor> a_shape_ = std::make_shared<EigenTensor>(a_shape_t, a_shape_t->GetData());
if (IndiceType == DT_INT32) {
auto a_shape_flat = a_shape_->vec<int32_t>();
for (int i = 0; i < b_t->GetTensorShape()->GetDims(); i++) {
if (out_t->GetTensorShape()->GetDimSize(i) != b_t->GetTensorShape()->GetDimSize(i) ||
a_shape_flat(i) != b_t->GetTensorShape()->GetDimSize(i)) {
KERNEL_LOG_ERROR(
"Dimension [%d] does not equal (no broadcasting is supported): y "
"side [%lld] vs x2 shape side [%lld] vs x1 shape side [%lld]",
i, out_t->GetTensorShape()->GetDimSize(i), b_t->GetTensorShape()->GetDimSize(i), a_shape_flat(i));
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else {
auto a_shape_flat = a_shape_->vec<int64_t>();
for (int i = 0; i < b_t->GetTensorShape()->GetDims(); i++) {
if (out_t->GetTensorShape()->GetDimSize(i) != b_t->GetTensorShape()->GetDimSize(i) ||
a_shape_flat(i) != b_t->GetTensorShape()->GetDimSize(i)) {
KERNEL_LOG_ERROR(
"Dimension [%d] does not equal (no broadcasting is supported): y "
"side [%lld] vs x2 shape side [%lld] vs x1 shape side [%lld]",
i, out_t->GetTensorShape()->GetDimSize(i), b_t->GetTensorShape()->GetDimSize(i), a_shape_flat(i));
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseTensorDenseAdd, SparseTensorDenseAddCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_DENSE_ADD_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_DENSE_ADD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseTensorDenseAddCpuKernel : public CpuKernel {
private:
/* data */
public:
SparseTensorDenseAddCpuKernel() = default;
~SparseTensorDenseAddCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
uint32_t ValidateInputs(CpuKernelContext &ctx);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,225 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <iostream>
#include <type_traits>
#include <vector>
#include "cpu_kernel_utils.h"
#include "sparse_tensor_dense_mat_mul.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
using namespace std;
namespace {
#define COL_SHED (1024 << 1)
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 4;
const char *kSparseTensorDenseMatMul = "SparseTensorDenseMatMul";
} // namespace
namespace aicpu {
uint32_t SparseTensorDenseMatMulCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SparseTensorDenseMatMul check input and output number failed.");
KERNEL_HANDLE_ERROR(SparseTensorDenseMatMulCheck(ctx), "SparseTensorDenseMatMul check params failed.");
DataType sparse_data_type = ctx.Input(1)->GetDataType();
DataType indice_data_type = ctx.Input(0)->GetDataType();
DataType dense_data_type = ctx.Input(3)->GetDataType();
DataType y_data_type = ctx.Output(0)->GetDataType();
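// Pick the regular_calculate instantiation matching the (sparse values, indices, dense, output)
// dtype combination; pairing a real operand with a complex one promotes the result to the
// complex type.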
if (sparse_data_type == DT_FLOAT && indice_data_type == DT_INT64 && dense_data_type == DT_FLOAT &&
y_data_type == DT_FLOAT)
regular_calculate<float, int64_t, float, float>(ctx);
else if (sparse_data_type == DT_FLOAT && indice_data_type == DT_INT64 && dense_data_type == DT_COMPLEX64 &&
y_data_type == DT_COMPLEX64)
regular_calculate<float, int64_t, complex<float>, complex<float>>(ctx);
else if (sparse_data_type == DT_FLOAT && indice_data_type == DT_INT32 && dense_data_type == DT_FLOAT &&
y_data_type == DT_FLOAT)
regular_calculate<float, int32_t, float, float>(ctx);
else if (sparse_data_type == DT_FLOAT && indice_data_type == DT_INT32 && dense_data_type == DT_COMPLEX64 &&
y_data_type == DT_COMPLEX64)
regular_calculate<float, int32_t, complex<float>, complex<float>>(ctx);
else if (sparse_data_type == DT_DOUBLE && indice_data_type == DT_INT64 && dense_data_type == DT_DOUBLE &&
y_data_type == DT_DOUBLE)
regular_calculate<double, int64_t, double, double>(ctx);
else if (sparse_data_type == DT_DOUBLE && indice_data_type == DT_INT64 && dense_data_type == DT_COMPLEX128 &&
y_data_type == DT_COMPLEX128)
regular_calculate<double, int64_t, complex<double>, complex<double>>(ctx);
else if (sparse_data_type == DT_DOUBLE && indice_data_type == DT_INT32 && dense_data_type == DT_DOUBLE &&
y_data_type == DT_DOUBLE)
regular_calculate<double, int32_t, double, double>(ctx);
else if (sparse_data_type == DT_DOUBLE && indice_data_type == DT_INT32 && dense_data_type == DT_COMPLEX128 &&
y_data_type == DT_COMPLEX128)
regular_calculate<double, int32_t, complex<double>, complex<double>>(ctx);
else if (sparse_data_type == DT_INT64 && indice_data_type == DT_INT64 && dense_data_type == DT_INT64 &&
y_data_type == DT_INT64)
regular_calculate<int64_t, int64_t, int64_t, int64_t>(ctx);
else if (sparse_data_type == DT_INT64 && indice_data_type == DT_INT32 && dense_data_type == DT_INT64 &&
y_data_type == DT_INT64)
regular_calculate<int64_t, int32_t, int64_t, int64_t>(ctx);
else if (sparse_data_type == DT_INT32 && indice_data_type == DT_INT64 && dense_data_type == DT_INT32 &&
y_data_type == DT_INT32)
regular_calculate<int32_t, int64_t, int32_t, int32_t>(ctx);
else if (sparse_data_type == DT_INT32 && indice_data_type == DT_INT32 && dense_data_type == DT_INT32 &&
y_data_type == DT_INT32)
regular_calculate<int32_t, int32_t, int32_t, int32_t>(ctx);
else if (sparse_data_type == DT_COMPLEX64 && indice_data_type == DT_INT64 && dense_data_type == DT_FLOAT &&
y_data_type == DT_COMPLEX64)
regular_calculate<complex<float>, int64_t, float, complex<float>>(ctx);
else if (sparse_data_type == DT_COMPLEX64 && indice_data_type == DT_INT64 && dense_data_type == DT_COMPLEX64 &&
y_data_type == DT_COMPLEX64)
regular_calculate<complex<float>, int64_t, complex<float>, complex<float>>(ctx);
else if (sparse_data_type == DT_COMPLEX64 && indice_data_type == DT_INT32 && dense_data_type == DT_FLOAT &&
y_data_type == DT_COMPLEX64)
regular_calculate<complex<float>, int32_t, float, complex<float>>(ctx);
else if (sparse_data_type == DT_COMPLEX64 && indice_data_type == DT_INT32 && dense_data_type == DT_COMPLEX64 &&
y_data_type == DT_COMPLEX64)
regular_calculate<complex<float>, int32_t, complex<float>, complex<float>>(ctx);
else if (sparse_data_type == DT_COMPLEX128 && indice_data_type == DT_INT64 && dense_data_type == DT_DOUBLE &&
y_data_type == DT_COMPLEX128)
regular_calculate<complex<double>, int64_t, double, complex<double>>(ctx);
else if (sparse_data_type == DT_COMPLEX128 && indice_data_type == DT_INT64 && dense_data_type == DT_COMPLEX128 &&
y_data_type == DT_COMPLEX128)
regular_calculate<complex<double>, int64_t, complex<double>, complex<double>>(ctx);
else if (sparse_data_type == DT_COMPLEX128 && indice_data_type == DT_INT32 && dense_data_type == DT_DOUBLE &&
y_data_type == DT_COMPLEX128)
regular_calculate<complex<double>, int32_t, double, complex<double>>(ctx);
else if (sparse_data_type == DT_COMPLEX128 && indice_data_type == DT_INT32 && dense_data_type == DT_COMPLEX128 &&
y_data_type == DT_COMPLEX128)
regular_calculate<complex<double>, int32_t, complex<double>, complex<double>>(ctx);
else if (sparse_data_type == DT_FLOAT16 && indice_data_type == DT_INT64 && dense_data_type == DT_FLOAT16 &&
y_data_type == DT_FLOAT16)
regular_calculate<Eigen::half, int64_t, Eigen::half, Eigen::half>(ctx);
else if (sparse_data_type == DT_FLOAT16 && indice_data_type == DT_INT32 && dense_data_type == DT_FLOAT16 &&
y_data_type == DT_FLOAT16)
regular_calculate<Eigen::half, int32_t, Eigen::half, Eigen::half>(ctx);
else {
KERNEL_LOG_ERROR(
"sparse_tensor_dense_mat_mul kernel wrong datatype."
"sparse_data_type [%s],"
"indices_data_type [%s],"
"dense_data_type [%s],"
"y_data_type [%s].",
DTypeStr(sparse_data_type).c_str(), DTypeStr(indice_data_type).c_str(), DTypeStr(dense_data_type).c_str(),
DTypeStr(y_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <class SparseType, class IndicesType, class DenseType, class OutputType>
uint32_t SparseTensorDenseMatMulCpuKernel::regular_calculate(CpuKernelContext &ctx) {
Tensor *x1_indices = ctx.Input(0);
Tensor *x1_values = ctx.Input(1);
Tensor *x1_shape = ctx.Input(2);
Tensor *x2 = ctx.Input(3);
Tensor *y = ctx.Output(0);
auto x1_indices_shape = x1_indices->GetTensorShape();
auto x2_shape = x2->GetTensorShape();
auto y_shape = y->GetTensorShape();
int64_t *x1_shape_data = (int64_t *)x1_shape->GetData();
uint64_t x1_row = x1_shape_data[0];
uint64_t x1_col = x1_shape_data[1];
uint64_t x2_row = x2_shape->GetDimSize(0);
uint64_t x2_col = x2_shape->GetDimSize(1);
AttrValue *adjoint_a = ctx.GetAttr("adjoint_a");
AttrValue *adjoint_b = ctx.GetAttr("adjoint_b");
SparseType *x1_values_data = (SparseType *)x1_values->GetData();
DenseType *x2_data = (DenseType *)x2->GetData();
OutputType *y_data = (OutputType *)y->GetData();
uint64_t y_data_len = y->NumElements();
for (uint64_t i = 0; i < y_data_len; i++) {
y_data[i] = static_cast<OutputType>(0);
}
if (adjoint_a->GetBool()) {
swap(x1_row, x1_col);
}
if (adjoint_b->GetBool()) {
swap(x2_row, x2_col);
}
uint64_t pairs = x1_indices_shape->GetDimSize(0);
IndicesType *x1_indices_data = (IndicesType *)x1_indices->GetData();
for (uint64_t i = 0; i < pairs; i++) {
uint64_t row = x1_indices_data[i << 1], col = x1_indices_data[1 + (i << 1)];
SparseType a = x1_values_data[i];
if (adjoint_a->GetBool()) {
swap(row, col);
}
KERNEL_CHECK_FALSE(row >= 0 && row < x1_row && col >= 0 && col < x1_col, KERNEL_STATUS_PARAM_INVALID,
"sparse size invalid.")
if (x2_col < COL_SHED) {
for (uint64_t j = 0; j < x2_col; j++) {
uint64_t idx = adjoint_b->GetBool() ? (j * x2_row + col) : (col * x2_col + j);
DenseType b = x2_data[idx];
if constexpr (std::is_same<DenseType, complex<double>>::value || std::is_same<DenseType, complex<float>>::value)
if (adjoint_b->GetBool()) {
b = conj(b);
}
y_data[row * x2_col + j] += a * b;
}
continue;
}
uint32_t min_core = 1;
uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
max_core = std::min(max_core, x2_col);
auto fun = [&](size_t s, size_t t) {
for (uint64_t j = s; j < t; j++) {
uint64_t idx = adjoint_b->GetBool() ? (j * x2_row + col) : (col * x2_col + j);
DenseType b = x2_data[idx];
if constexpr (std::is_same<DenseType, complex<double>>::value || std::is_same<DenseType, complex<float>>::value)
if (adjoint_b->GetBool()) {
b = conj(b);
}
y_data[row * x2_col + j] += a * b;
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x2_col, x2_col / max_core, fun),
"SparseTensorDenseMatMul Compute failed.");
}
return KERNEL_STATUS_OK;
}
uint32_t SparseTensorDenseMatMulCpuKernel::SparseTensorDenseMatMulCheck(CpuKernelContext &ctx) {
Tensor *x1_indices = ctx.Input(0);
Tensor *x1_values = ctx.Input(1);
Tensor *x1_shape = ctx.Input(2);
Tensor *x2 = ctx.Input(3);
Tensor *y = ctx.Output(0);
AttrValue *adjoint_a = ctx.GetAttr("adjoint_a"), *adjoint_b = ctx.GetAttr("adjoint_b");
KERNEL_CHECK_NULLPTR(x1_indices, KERNEL_STATUS_PARAM_INVALID, "Get input 0 failed.")
KERNEL_CHECK_NULLPTR(x1_values, KERNEL_STATUS_PARAM_INVALID, "Get input 1 failed.")
KERNEL_CHECK_NULLPTR(x1_shape, KERNEL_STATUS_PARAM_INVALID, "Get input 2 failed.")
KERNEL_CHECK_NULLPTR(x2, KERNEL_STATUS_PARAM_INVALID, "Get input 3 failed.")
KERNEL_CHECK_NULLPTR(y, KERNEL_STATUS_PARAM_INVALID, "Get output 0 failed.")
KERNEL_CHECK_NULLPTR(adjoint_a, KERNEL_STATUS_PARAM_INVALID, "Get attribute adjoint_a failed.")
KERNEL_CHECK_NULLPTR(adjoint_b, KERNEL_STATUS_PARAM_INVALID, "Get attribute adjoint_b failed.")
KERNEL_CHECK_FALSE(x1_indices->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
KERNEL_CHECK_FALSE(x1_values->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
  KERNEL_CHECK_FALSE(x1_shape->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 2 data failed.")
KERNEL_CHECK_FALSE(x2->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 3 data failed.")
KERNEL_CHECK_FALSE(y->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed.")
KERNEL_CHECK_FALSE(x1_shape->GetDataType() == DT_INT64, KERNEL_STATUS_PARAM_INVALID, "x1_shape must be DT_INT64")
KERNEL_CHECK_FALSE(x1_shape->GetTensorShape()->GetDims() == 1 && x1_shape->NumElements() == 2 &&
x1_indices->GetTensorShape()->GetDimSize(0) == x1_values->NumElements(),
KERNEL_STATUS_PARAM_INVALID, "sparse tensor x1 dimension error.")
KERNEL_CHECK_FALSE(x2->GetTensorShape()->GetDims() == 2, KERNEL_STATUS_PARAM_INVALID, "matrix x2 dimension error.")
int64_t *x1_shape_data = (int64_t *)x1_shape->GetData();
uint64_t x1_col = x1_shape_data[!adjoint_a->GetBool()];
uint64_t x2_row = x2->GetTensorShape()->GetDimSize(adjoint_b->GetBool());
KERNEL_CHECK_FALSE(x1_col == x2_row, KERNEL_STATUS_PARAM_INVALID, "can not do matrix multiplication.")
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseTensorDenseMatMul, SparseTensorDenseMatMulCpuKernel);
} // namespace aicpu
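
For the non-adjoint case, regular_calculate above accumulates, for every stored (row, col, value) triple of the sparse operand, value * x2[col, :] into y[row, :]. A minimal single-threaded sketch of that accumulation over plain row-major buffers is shown below; the names and the [nnz, 2] index layout are illustrative assumptions, not part of the kernel.

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical reference, not the kernel: y[row, :] += value * x2[col, :] for every non-zero.
void SparseDenseMatMulReference(const std::vector<int64_t> &indices,  // [nnz, 2] (row, col) pairs
                                const std::vector<float> &values,     // [nnz]
                                const std::vector<float> &x2,         // [k, n], row-major
                                std::vector<float> *y,                // [m, n], row-major, pre-sized
                                int64_t n) {
  std::fill(y->begin(), y->end(), 0.0f);
  for (size_t i = 0; i < values.size(); ++i) {
    const int64_t row = indices[2 * i];
    const int64_t col = indices[2 * i + 1];
    for (int64_t j = 0; j < n; ++j) {
      (*y)[row * n + j] += values[i] * x2[col * n + j];
    }
  }
}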

View File

@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_DENSE_MAT_MUL_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_DENSE_MAT_MUL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseTensorDenseMatMulCpuKernel : public CpuKernel {
public:
SparseTensorDenseMatMulCpuKernel() = default;
~SparseTensorDenseMatMulCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <class SparseType, class IndicesType, class DenseType, class OutputType>
static uint32_t regular_calculate(CpuKernelContext &ctx);
static uint32_t SparseTensorDenseMatMulCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,200 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_tensor_to_csr_sparse_matrix.h"
#include <complex>
#include <numeric>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 5;
const char *SparseTensorToCSRSparseMatrix = "SparseTensorToCSRSparseMatrix";
const int DIM2 = 2;
const int DIM3 = 3;
} // namespace
namespace aicpu {
uint32_t SparseTensorToCSRSparseMatrixCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseTensorToCSRSparseMatrix normal check failed.");
Tensor *x_indices = ctx.Input(0);
Tensor *x_values = ctx.Input(1);
Tensor *x_dense_shape = ctx.Input(2);
const int rank = x_dense_shape->NumElements();
if (rank != DIM2 && rank != DIM3) {
KERNEL_LOG_ERROR("SparseTensor must have rank 2 or 3.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_indices_shape = x_indices->GetTensorShape();
auto x_values_shape = x_values->GetTensorShape();
if (x_indices_shape->NumElements() / rank != x_values_shape->NumElements()) {
    KERNEL_LOG_ERROR("The number of indices in x_indices does not match the number of values in x_values.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_dense_shape_data_type = x_dense_shape->GetDataType();
auto x_indices_data_type = x_indices->GetDataType();
if (x_indices_data_type != DT_INT32 && x_indices_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseTensorToCSRSparseMatrix kernel data type [%s] not support.",
DTypeStr(x_indices_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (x_dense_shape_data_type != x_indices_data_type) {
KERNEL_LOG_ERROR("SparseTensorToCSRSparseMatrix kernel data type mismatch.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_values_data_type = x_values->GetDataType();
uint32_t status;
switch (x_indices_data_type) {
case DT_INT32:
switch (x_values_data_type) {
case DT_FLOAT:
status = ComputeKernel<int32_t, float>(ctx);
break;
case DT_DOUBLE:
status = ComputeKernel<int32_t, double>(ctx);
break;
case DT_COMPLEX64:
status = ComputeKernel<int32_t, std::complex<float> >(ctx);
break;
case DT_COMPLEX128:
status = ComputeKernel<int32_t, std::complex<double> >(ctx);
break;
default:
KERNEL_LOG_ERROR(
"SparseTensorToCSRSparseMatrix kernel data type [%s] not "
"support.",
DTypeStr(x_values_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (x_values_data_type) {
case DT_FLOAT:
status = ComputeKernel<int64_t, float>(ctx);
break;
case DT_DOUBLE:
status = ComputeKernel<int64_t, double>(ctx);
break;
case DT_COMPLEX64:
status = ComputeKernel<int64_t, std::complex<float> >(ctx);
break;
case DT_COMPLEX128:
status = ComputeKernel<int64_t, std::complex<double> >(ctx);
break;
default:
KERNEL_LOG_ERROR(
"SparseTensorToCSRSparseMatrix kernel data type [%s] not "
"support.",
DTypeStr(x_values_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("data type of indices is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
if (status != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("SparseTensorToCSRSparseMatrix kernel compute failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SparseTensorToCSRSparseMatrix, SparseTensorToCSRSparseMatrixCpuKernel);
template <typename indicesT, typename dataT>
uint32_t SparseTensorToCSRSparseMatrixCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
auto x_dense_shape = ctx.Input(2);
auto x_dense_shape_ptr = static_cast<indicesT *>(x_dense_shape->GetData());
auto y_dense_shape_ptr = static_cast<indicesT *>(ctx.Output(0)->GetData());
auto x_values_ptr = static_cast<dataT *>(ctx.Input(1)->GetData());
auto y_values_ptr = static_cast<dataT *>(ctx.Output(4)->GetData());
// Copy the CSRSparseMatrix's dense_shape and values from the SparseTensor.
for (int64_t i = 0; i < x_dense_shape->GetTensorShape()->NumElements(); i++) {
y_dense_shape_ptr[i] = x_dense_shape_ptr[i];
}
for (int64_t i = 0; i < ctx.Input(1)->GetTensorShape()->NumElements(); i++) {
y_values_ptr[i] = x_values_ptr[i];
}
auto y_batch_pointers_ptr = static_cast<indicesT *>(ctx.Output(1)->GetData());
auto y_row_pointers_ptr = static_cast<indicesT *>(ctx.Output(2)->GetData());
auto y_col_indices_ptr = static_cast<indicesT *>(ctx.Output(3)->GetData());
auto x_indices_ptr = static_cast<indicesT *>(ctx.Input(0)->GetData());
const int rank = ctx.Input(2)->NumElements();
const int64_t batch_size = (rank == DIM2) ? 1 : x_dense_shape_ptr[0];
const int64_t num_rows = x_dense_shape_ptr[(rank == DIM2) ? 0 : 1];
const int64_t total_nnz = ctx.Input(1)->NumElements();
for (int64_t i = 0; i < batch_size * (num_rows + 1); i++) {
y_row_pointers_ptr[i] = 0;
}
int64_t prev_batch = -1;
if (rank == DIM2) {
// For a single batch, the batch_ptrs are {0, total_nnz}.
y_batch_pointers_ptr[0] = 0;
++prev_batch;
for (int64_t i = 0; i < total_nnz; ++i) {
// For now, the rows pointers store the corresponding row counts.
int64_t offset = i * rank;
y_row_pointers_ptr[x_indices_ptr[offset] + 1] += 1;
y_col_indices_ptr[i] = x_indices_ptr[++offset];
}
} else { // rank == 3
for (int64_t i = 0; i < total_nnz; ++i) {
int64_t offset = i * rank;
const int cur_batch = x_indices_ptr[offset];
// For now, the rows pointers store the corresponding row counts.
y_row_pointers_ptr[cur_batch * (num_rows + 1) + x_indices_ptr[++offset] + 1] += 1;
y_col_indices_ptr[i] = x_indices_ptr[++offset];
// We're at a new batch and might have skipped over empty batches.
while (prev_batch < cur_batch) {
// The previous batch ends at position i.
y_batch_pointers_ptr[prev_batch + 1] = i;
++prev_batch;
}
}
}
// Set the last element of batch_ptr and account for trailing empty batches.
while (prev_batch < batch_size) {
y_batch_pointers_ptr[prev_batch + 1] = total_nnz;
++prev_batch;
}
// Compute the cumulative row counts for each batch.
for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
auto *row_ptr_batch = y_row_pointers_ptr + batch_idx * (num_rows + 1);
std::partial_sum(row_ptr_batch, row_ptr_batch + num_rows + 1, row_ptr_batch);
}
return KERNEL_STATUS_OK;
}
} // namespace aicpu
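
ComputeKernel above builds the CSR row pointers in two passes: it first counts the non-zeros of each row into the slot one past the row index, then turns the counts into offsets with std::partial_sum. A standalone sketch of that idea for a single batch, with a worked example, is given below; the helper name is hypothetical.

#include <cstdint>
#include <numeric>
#include <vector>

// Hypothetical helper, not the kernel: CSR row pointers from the COO row indices of one batch.
std::vector<int64_t> CooRowsToCsrRowPointers(const std::vector<int64_t> &coo_rows, int64_t num_rows) {
  std::vector<int64_t> row_ptr(num_rows + 1, 0);
  for (int64_t r : coo_rows) {
    row_ptr[r + 1] += 1;  // row_ptr temporarily holds per-row non-zero counts
  }
  std::partial_sum(row_ptr.begin(), row_ptr.end(), row_ptr.begin());  // counts -> offsets
  return row_ptr;
}
// e.g. coo_rows = {0, 0, 2} with num_rows = 3 yields row_ptr = {0, 2, 2, 3}.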

View File

@ -0,0 +1,34 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_TO_CSR_SPARSE_MATRIX_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_TENSOR_TO_CSR_SPARSE_MATRIX_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseTensorToCSRSparseMatrixCpuKernel : public CpuKernel {
public:
~SparseTensorToCSRSparseMatrixCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename indicesT, typename dataT>
uint32_t ComputeKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,605 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sspaddmm.h"
#include <complex>
#include <iostream>
#include "utils/eigen_tensor.h"
namespace aicpu {
const char *SSPADDMM = "Sspaddmm";
#define SPADDMM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SspaddmmCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Sspaddmm kernel compute failed."); \
return result; \
} \
break; \
}
// scalar * sparse values, used for beta * input and alpha * mat1
template <typename T>
T *SspaddmmCpuKernel::ScalarSparseMul(CpuKernelContext &ctx, Tensor *vals, Tensor *scalar) {
T scalar_val;
auto scalar_val_addr = scalar->GetData();
switch (scalar->GetDataType()) {
case DT_UINT8:
scalar_val = static_cast<T>(reinterpret_cast<uint8_t *>(scalar_val_addr)[0]);
break;
case DT_UINT16:
scalar_val = static_cast<T>(reinterpret_cast<uint16_t *>(scalar_val_addr)[0]);
break;
case DT_UINT32:
scalar_val = static_cast<T>(reinterpret_cast<uint32_t *>(scalar_val_addr)[0]);
break;
case DT_UINT64:
scalar_val = static_cast<T>(reinterpret_cast<uint64_t *>(scalar_val_addr)[0]);
break;
case DT_INT8:
scalar_val = static_cast<T>(reinterpret_cast<int8_t *>(scalar_val_addr)[0]);
break;
case DT_INT16:
scalar_val = static_cast<T>(reinterpret_cast<int16_t *>(scalar_val_addr)[0]);
break;
case DT_INT32:
scalar_val = static_cast<T>(reinterpret_cast<int32_t *>(scalar_val_addr)[0]);
break;
case DT_INT64:
scalar_val = static_cast<T>(reinterpret_cast<int64_t *>(scalar_val_addr)[0]);
break;
case DT_FLOAT16:
scalar_val = static_cast<T>(reinterpret_cast<Eigen::half *>(scalar_val_addr)[0]);
break;
case DT_FLOAT:
scalar_val = static_cast<T>(reinterpret_cast<float *>(scalar_val_addr)[0]);
break;
case DT_DOUBLE:
scalar_val = static_cast<T>(reinterpret_cast<double *>(scalar_val_addr)[0]);
break;
case DT_BOOL:
scalar_val = static_cast<T>(reinterpret_cast<bool *>(scalar_val_addr)[0]);
break;
case DT_COMPLEX64:
scalar_val = static_cast<T>(reinterpret_cast<std::complex<float> *>(scalar_val_addr)[0].real());
break;
case DT_COMPLEX128:
scalar_val = static_cast<T>(reinterpret_cast<std::complex<double> *>(scalar_val_addr)[0].real());
break;
default:
      KERNEL_LOG_ERROR("For Sspaddmm, scalar dtype [%s] is not supported", DTypeStr(scalar->GetDataType()).c_str());
return nullptr;
}
T *val_addr = reinterpret_cast<T *>(vals->GetData());
uint32_t data_num = vals->GetTensorShape()->GetDimSize(0);
T *val_addr_bak = new T[data_num];
if (data_num >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (data_num <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto multi = [&val_addr, &val_addr_bak, scalar_val](uint32_t start, uint32_t end) {
for (uint32_t idx = start; idx < end; idx++) {
val_addr_bak[idx] = val_addr[idx] * scalar_val;
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, multi);
} else {
    // multiply into a separate buffer; in-place *= may not be defined for mixed types (e.g. Eigen::half and float)
for (uint32_t idx = 0; idx < data_num; idx++) {
val_addr_bak[idx] = val_addr[idx] * scalar_val;
}
}
return val_addr_bak;
}
template <typename T>
void SspaddmmCpuKernel::Clear(Tensor *tensor, CpuKernelContext &ctx) {
T *addr = reinterpret_cast<T *>(tensor->GetData());
uint32_t num = tensor->GetTensorShape()->GetDimSize(0);
if (num >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (num <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > num) {
max_core_num = num;
}
auto multi = [&addr](uint32_t start, uint32_t end) {
for (uint32_t idx = start; idx < end; idx++) {
addr[idx] = static_cast<T>(0);
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, num, num / max_core_num, multi);
} else {
    // serial path: zero the value buffer
for (uint32_t idx = 0; idx < num; idx++) {
addr[idx] = static_cast<T>(0);
}
}
}
template <typename T>
void SspaddmmCpuKernel::ClearIndices(Tensor *tensor, CpuKernelContext &ctx) {
T *addr = reinterpret_cast<T *>(tensor->GetData());
uint32_t num = 2 * tensor->GetTensorShape()->GetDimSize(1);
if (num >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (num <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > num) {
max_core_num = num;
}
auto multi = [&addr](uint32_t start, uint32_t end) {
for (uint32_t idx = start; idx < end; idx++) {
addr[idx] = static_cast<T>(0);
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, num, num / max_core_num, multi);
} else {
    // serial path: zero the indices buffer
for (uint32_t idx = 0; idx < num; idx++) {
addr[idx] = static_cast<T>(0);
}
}
}
template <typename T1>
uint32_t SspaddmmCpuKernel::BoundaryCheck(Tensor *tensor, Tensor *shape_tensor, int64_t nums, CpuKernelContext &ctx) {
int64_t row;
int64_t col;
if (shape_tensor->GetDataType() == DT_INT32) {
int32_t *in_dim = reinterpret_cast<int32_t *>(shape_tensor->GetData());
row = static_cast<int64_t>(in_dim[0]);
col = static_cast<int64_t>(in_dim[1]);
} else {
int64_t *in_dim = reinterpret_cast<int64_t *>(shape_tensor->GetData());
row = in_dim[0];
col = in_dim[1];
}
if (row <= 0 || col <= 0) {
    KERNEL_LOG_ERROR("For sspaddmm, sparse tensor shape should be positive, but got [%lld, %lld]", row, col);
return KERNEL_STATUS_PARAM_INVALID;
}
T1 *addr = reinterpret_cast<T1 *>(tensor->GetData());
uint32_t data_num = static_cast<uint32_t>(nums);
if (data_num >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (data_num <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
    auto multi = [&](uint32_t start, uint32_t end) {
      for (uint32_t i = start; i < end; i++) {
        int64_t row_tmp = static_cast<int64_t>(addr[i]);
        int64_t col_tmp = static_cast<int64_t>(addr[i + data_num]);
        if (row_tmp >= row || row_tmp < 0) {
          KERNEL_LOG_ERROR("For sspaddmm, sparse tensor indices row index [%lld] out of range [0, %lld)", row_tmp, row);
          return;
        }
        if (col_tmp >= col || col_tmp < 0) {
          KERNEL_LOG_ERROR("For sspaddmm, sparse tensor indices col index [%lld] out of range [0, %lld)", col_tmp, col);
          return;
        }
      }
    };
    max_core_num = max_core_num == 0 ? 1 : max_core_num;
    // Note: ParallelFor does not propagate shard status, so out-of-range indices are only logged in this path.
    CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, multi);
return KERNEL_STATUS_OK;
} else {
    for (uint32_t i = 0; i < data_num; i++) {
      int64_t row_tmp = static_cast<int64_t>(addr[i]);
      int64_t col_tmp = static_cast<int64_t>(addr[i + data_num]);
      if (row_tmp >= row || row_tmp < 0) {
        KERNEL_LOG_ERROR("For sspaddmm, sparse tensor indices row index [%lld] out of range [0, %lld)", row_tmp, row);
        return KERNEL_STATUS_PARAM_INVALID;
      }
      if (col_tmp >= col || col_tmp < 0) {
        KERNEL_LOG_ERROR("For sspaddmm, sparse tensor indices col index [%lld] out of range [0, %lld)", col_tmp, col);
        return KERNEL_STATUS_PARAM_INVALID;
      }
    }
return KERNEL_STATUS_OK;
}
}
// sparse matrix multiply dense matrix
template <typename T_idx, typename T>
uint32_t SspaddmmCpuKernel::SparseMulDense(CpuKernelContext &ctx, Tensor *mat1_indices_tensor, T *mat1_val_addr,
Tensor *mat2_values_tensor, Tensor *output_indices_tensor,
Tensor *output_values_tensor, const int64_t row, const int64_t mat2_col) {
const int mat1_vals_num = mat1_indices_tensor->GetTensorShape()->GetDimSize(1);
// the result of mat1 @ mat2 will write to output directly
T_idx *mat1_idx_addr = reinterpret_cast<T_idx *>(mat1_indices_tensor->GetData());
T *mat2_val_addr = reinterpret_cast<T *>(mat2_values_tensor->GetData());
int64_t *out_idx_addr = reinterpret_cast<int64_t *>(output_indices_tensor->GetData());
T *out_val_addr = reinterpret_cast<T *>(output_values_tensor->GetData());
int out_num = output_indices_tensor->GetTensorShape()->GetDimSize(1);
std::unordered_map<T_idx, std::unordered_map<int64_t, uint32_t>> idx_map_cnt;
std::unordered_map<T_idx, std::vector<T_idx>> unrepeated;
std::unordered_map<T_idx, std::unordered_map<T_idx, std::vector<T>>> co_map_idx;
// unrepeated : [1 -> [0], 2 -> [1, 2]]
// co_map_idx : [1][0] -> 0.3
for (int64_t i = 0; i < mat1_vals_num; i++) {
T_idx _row = mat1_idx_addr[i];
T_idx _col = mat1_idx_addr[i + mat1_vals_num];
unrepeated[_row].push_back(_col);
co_map_idx[_row][_col].push_back(mat1_val_addr[i]);
for (uint32_t j = 0; j < mat2_col; j++) {
if (idx_map_cnt[_row][j] == 0) {
idx_map_cnt[_row][j] = this->cnt_;
this->cnt_++;
}
}
}
std::vector<T_idx> res;
for (auto it = unrepeated.begin(); it != unrepeated.end(); it++) {
res.push_back(it->first);
}
uint32_t n_unreapeat = unrepeated.size();
if (n_unreapeat * mat2_col > kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (n_unreapeat <= kParallelDataNumSameShape_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > n_unreapeat) {
max_core_num = n_unreapeat;
}
auto multi = [&](uint32_t start, uint32_t end) {
for (uint32_t i = start; i < end; i++) {
// get val
auto row_mat1 = res[i];
for (auto row_mat2 : unrepeated[row_mat1]) {
T val = co_map_idx[row_mat1][row_mat2].back();
co_map_idx[row_mat1][row_mat2].pop_back();
for (int64_t j = 0; j < mat2_col; j++) {
// get val
T_idx idx = idx_map_cnt[row_mat1][j];
*(out_val_addr + idx) += val * mat2_val_addr[row_mat2 * mat2_col + j];
out_idx_addr[idx] = static_cast<int64_t>(row_mat1);
out_idx_addr[idx + out_num] = j;
}
}
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, n_unreapeat, n_unreapeat / max_core_num, multi);
} else {
for (uint32_t i = 0; i < n_unreapeat; i++) {
// get val
auto row_mat1 = res[i];
for (auto row_mat2 : unrepeated[row_mat1]) {
T val = co_map_idx[row_mat1][row_mat2].back();
co_map_idx[row_mat1][row_mat2].pop_back();
for (int64_t j = 0; j < mat2_col; j++) {
// get val
T_idx idx = idx_map_cnt[row_mat1][j];
*(out_val_addr + idx) += val * mat2_val_addr[row_mat2 * mat2_col + j];
out_idx_addr[idx] = static_cast<int64_t>(row_mat1);
out_idx_addr[idx + out_num] = j;
}
}
}
}
return KERNEL_STATUS_OK;
}
// sparse matrix add sparse matrix
// input + mat1 @ mat2
template <typename T_idx, typename T>
uint32_t SspaddmmCpuKernel::SparseAddSparse(CpuKernelContext &ctx, Tensor *input_indices_tensor, T *in_val_addr,
Tensor *output_indices_tensor, Tensor *output_values_tensor) {
// to implement m1[row][col] = vals
uint32_t input_nums = input_indices_tensor->GetTensorShape()->GetDimSize(1);
this->cnt_ = input_nums;
// get output vals and index addr
T *out_val_addr = reinterpret_cast<T *>(output_values_tensor->GetData());
int64_t *out_idx_addr = reinterpret_cast<int64_t *>(output_indices_tensor->GetData());
int out_num = output_indices_tensor->GetTensorShape()->GetDimSize(1);
// if input idx not in output, will append at the end of output
T_idx *input_addr = reinterpret_cast<T_idx *>(input_indices_tensor->GetData());
if (input_nums >= kParallelDataNumSameShape_) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, CpuKernelUtils::GetCPUNum(ctx));
if (input_nums <= kParallelDataNumSameShapeMid_) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > input_nums) {
max_core_num = input_nums;
}
auto multi = [&](uint32_t start, uint32_t end) {
for (uint32_t i = start; i < end; i++) {
auto row = input_addr[i];
auto col = input_addr[i + input_nums];
// else append it at the end
out_val_addr[i] = in_val_addr[i];
// copy indices[0]
out_idx_addr[i] = row;
// copy indices[1]
out_idx_addr[i + out_num] = col;
}
};
max_core_num = max_core_num == 0 ? 1 : max_core_num;
CpuKernelUtils::ParallelFor(ctx, input_nums, input_nums / max_core_num, multi);
} else {
for (uint32_t i = 0; i < input_nums; i++) {
auto row = input_addr[i];
auto col = input_addr[i + input_nums];
// else append it at the end
out_val_addr[i] = in_val_addr[i];
// copy indices[0]
out_idx_addr[i] = row;
// copy indices[1]
out_idx_addr[i + out_num] = col;
}
}
return KERNEL_STATUS_OK;
}
int64_t SspaddmmCpuKernel::GetIndicesNum(Tensor *tensor) {
if (tensor->GetDataType() == DT_INT32) {
int32_t *a = reinterpret_cast<int32_t *>(tensor->GetData());
return static_cast<int64_t>(a[1]);
}
int64_t *a = reinterpret_cast<int64_t *>(tensor->GetData());
return a[1];
}
template <typename T>
uint32_t SspaddmmCpuKernel::SspaddmmCompute(CpuKernelContext &ctx) {
Tensor *input_indices_tensor = ctx.Input(0);
Tensor *input_values_tensor = ctx.Input(1);
Tensor *input_shapes_tensor = ctx.Input(2);
Tensor *mat1_indices_tensor = ctx.Input(3);
Tensor *mat1_values_tensor = ctx.Input(4);
Tensor *mat1_shapes_tensor = ctx.Input(5);
Tensor *mat2_values_tensor = ctx.Input(6);
Tensor *alpha_tensor = ctx.Input(7);
Tensor *beta_tensor = ctx.Input(8);
Tensor *output_indices_tensor = ctx.Output(0);
Tensor *output_values_tensor = ctx.Output(1);
Clear<T>(output_values_tensor, ctx);
ClearIndices<int64_t>(output_indices_tensor, ctx);
// scalar * sparse inplace
  T *input_values_addr_bak = ScalarSparseMul<T>(ctx, input_values_tensor, beta_tensor);
  T *mat1_values_addr_bak = ScalarSparseMul<T>(ctx, mat1_values_tensor, alpha_tensor);
  KERNEL_CHECK_NULLPTR(input_values_addr_bak, KERNEL_STATUS_PARAM_INVALID, "Scale input values by beta failed.")
  KERNEL_CHECK_NULLPTR(mat1_values_addr_bak, KERNEL_STATUS_PARAM_INVALID, "Scale mat1 values by alpha failed.")
// sparse * mat write to output directly
auto row = GetIndicesNum(mat1_shapes_tensor);
auto col = GetIndicesNum(input_shapes_tensor);
// sparse + sparse
if (input_indices_tensor->GetDataType() == DT_INT32) {
SparseAddSparse<int32_t, T>(ctx, input_indices_tensor, input_values_addr_bak, output_indices_tensor,
output_values_tensor);
} else {
SparseAddSparse<int64_t, T>(ctx, input_indices_tensor, input_values_addr_bak, output_indices_tensor,
output_values_tensor);
}
if (mat1_indices_tensor->GetDataType() == DT_INT32) {
SparseMulDense<int32_t, T>(ctx, mat1_indices_tensor, mat1_values_addr_bak, mat2_values_tensor,
output_indices_tensor, output_values_tensor, row, col);
} else {
SparseMulDense<int64_t, T>(ctx, mat1_indices_tensor, mat1_values_addr_bak, mat2_values_tensor,
output_indices_tensor, output_values_tensor, row, col);
}
  delete[] input_values_addr_bak;
  delete[] mat1_values_addr_bak;
  return KERNEL_STATUS_OK;
}
uint32_t SspaddmmCpuKernel::ValidParam(CpuKernelContext &ctx) {
// valid input and output nullptr
Tensor *input_indices_tensor = ctx.Input(0);
Tensor *input_values_tensor = ctx.Input(1);
Tensor *input_shapes_tensor = ctx.Input(2);
Tensor *mat1_indices_tensor = ctx.Input(3);
Tensor *mat1_values_tensor = ctx.Input(4);
Tensor *mat1_shapes_tensor = ctx.Input(5);
Tensor *mat2_tensor = ctx.Input(6);
Tensor *alpha_tensor = ctx.Input(7);
Tensor *beta_tensor = ctx.Input(8);
Tensor *output_indices_tensor = ctx.Output(0);
Tensor *output_values_tensor = ctx.Output(1);
Tensor *output_shapes_tensor = ctx.Output(2);
// valid shape nullptr
auto mat1_values_shape = mat1_values_tensor->GetTensorShape();
auto mat1_shapes_shape = mat1_shapes_tensor->GetTensorShape();
auto mat1_indices_shape = mat1_indices_tensor->GetTensorShape();
auto mat2_shapes_shape = mat2_tensor->GetTensorShape();
auto input_values_shape = input_values_tensor->GetTensorShape();
auto input_shapes_shape = input_shapes_tensor->GetTensorShape();
auto input_indices_shape = input_indices_tensor->GetTensorShape();
auto output_values_shape = output_values_tensor->GetTensorShape();
auto output_shapes_shape = output_shapes_tensor->GetTensorShape();
auto output_indices_shape = output_indices_tensor->GetTensorShape();
auto alpha_shape = alpha_tensor->GetTensorShape();
auto beta_shape = beta_tensor->GetTensorShape();
// sparse_indices
// GetDims() will return dims number, uint32_t
if (mat1_indices_shape->GetDims() != 2) {
KERNEL_LOG_ERROR(
"Mat1 sparse_indices should be 2D, got dim "
"size [%d].",
mat1_indices_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (input_indices_shape->GetDims() != 2) {
KERNEL_LOG_ERROR(
"Input sparse_indices should be 2D, got dim "
"size [%d].",
input_indices_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (output_indices_shape->GetDims() != 2) {
KERNEL_LOG_ERROR(
"Output sparse_indices should be 2D, got dim "
"size [%d].",
      output_indices_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
// valid data type
int32_t mat1_IndiceType = mat1_indices_tensor->GetDataType();
int32_t input_IndiceType = input_indices_tensor->GetDataType();
int32_t output_IndiceType = output_indices_tensor->GetDataType();
int32_t mat1_ShapeType = mat1_shapes_tensor->GetDataType();
int32_t input_ShapeType = input_shapes_tensor->GetDataType();
int32_t output_ShapeType = output_shapes_tensor->GetDataType();
bool validIndiceType = ((mat1_IndiceType == DT_INT32) || (mat1_IndiceType == DT_INT64)) &&
((output_IndiceType == DT_INT32) || (output_IndiceType == DT_INT64)) &&
((input_IndiceType == DT_INT32) || (input_IndiceType == DT_INT64));
bool validShapeType = ((mat1_ShapeType == DT_INT32) || (mat1_ShapeType == DT_INT64)) &&
((output_ShapeType == DT_INT32) || (output_ShapeType == DT_INT64)) &&
((input_ShapeType == DT_INT32) || (input_ShapeType == DT_INT64));
if (!validShapeType || !validIndiceType) {
KERNEL_LOG_ERROR(
"Valid indice and shape data type failed, "
"indiceType and shapeType should be INT32 or INT64");
return KERNEL_STATUS_PARAM_INVALID;
}
// sparse_values' number check
int32_t mat1_values_dims_size = mat1_values_shape->GetDims();
int32_t input_values_dims_size = input_values_shape->GetDims();
if ((mat1_values_dims_size != 0) && (mat1_values_dims_size != 1)) {
KERNEL_LOG_ERROR(
"mat1 values_shape should be a scalar or a vector, "
"got dim size [%d].",
mat1_values_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if ((input_values_dims_size != 0) && (input_values_dims_size != 1)) {
KERNEL_LOG_ERROR(
"input values_shape should be a scalar or a vector, "
"got dim size [%d].",
input_values_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t mat1_elems_num = mat1_indices_shape->GetDims() > 0 ? mat1_indices_shape->GetDimSize(1) : 1;
int64_t input_elems_num = input_indices_shape->GetDims() > 0 ? input_indices_shape->GetDimSize(1) : 1;
if ((mat1_values_dims_size == 1) && (mat1_values_tensor->NumElements() != mat1_elems_num)) {
KERNEL_LOG_ERROR(
"mat1 values_shape has incorrect number of elements [%lld], "
"should be [%lld]",
mat1_values_tensor->NumElements(), mat1_elems_num);
return KERNEL_STATUS_PARAM_INVALID;
}
if ((input_values_dims_size == 1) && (input_values_tensor->NumElements() != input_elems_num)) {
KERNEL_LOG_ERROR(
"input values_shape has incorrect number of elements [%lld], "
"should be [%lld]",
input_values_tensor->NumElements(), input_elems_num);
return KERNEL_STATUS_PARAM_INVALID;
}
if (alpha_shape->GetDims() > 1) {
    KERNEL_LOG_ERROR("alpha should be a scalar or a vector, but got dim num [%d]", alpha_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (beta_shape->GetDims() > 1) {
    KERNEL_LOG_ERROR("beta should be a scalar or a vector, but got dim num [%d]", beta_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
uint32_t status = KERNEL_STATUS_OK;
if (input_indices_tensor->GetDataType() == DT_INT32) {
status = BoundaryCheck<int32_t>(input_indices_tensor, input_shapes_tensor, input_values_tensor->NumElements(), ctx);
} else {
status = BoundaryCheck<int64_t>(input_indices_tensor, input_shapes_tensor, input_values_tensor->NumElements(), ctx);
}
if (status != KERNEL_STATUS_OK) {
return status;
}
if (mat1_indices_tensor->GetDataType() == DT_INT32) {
status = BoundaryCheck<int32_t>(mat1_indices_tensor, mat1_shapes_tensor, mat1_values_tensor->NumElements(), ctx);
} else {
status = BoundaryCheck<int64_t>(mat1_indices_tensor, mat1_shapes_tensor, mat1_values_tensor->NumElements(), ctx);
}
return status;
}
uint32_t SspaddmmCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, this->kInputNum, this->kOutputNum),
"Sspaddmm check input and output number failed.");
if (ValidParam(ctx) != KERNEL_STATUS_OK) {
    KERNEL_LOG_ERROR("Sspaddmm check params failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
Tensor *input_shapes_tensor = ctx.Input(2);
Tensor *output_values_tensor = ctx.Output(1);
Tensor *output_shapes_tensor = ctx.Output(2);
int64_t *ou_dim = reinterpret_cast<int64_t *>(output_shapes_tensor->GetData());
if (input_shapes_tensor->GetDataType() == DT_INT32) {
int32_t *in_dim = reinterpret_cast<int32_t *>(input_shapes_tensor->GetData());
for (int32_t index = 0; index < 2; ++index) {
ou_dim[index] = in_dim[index];
}
} else {
int64_t *in_dim = reinterpret_cast<int64_t *>(input_shapes_tensor->GetData());
for (int32_t index = 0; index < 2; ++index) {
ou_dim[index] = in_dim[index];
}
}
auto output_dtype = output_values_tensor->GetDataType();
switch (output_dtype) {
SPADDMM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SPADDMM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SPADDMM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SPADDMM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SPADDMM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    SPADDMM_COMPUTE_CASE(DT_FLOAT, float, ctx)
    SPADDMM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("Sspaddmm kernel data type [%s] not support.", DTypeStr(output_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SSPADDMM, SspaddmmCpuKernel);
} // namespace aicpu
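
Sspaddmm computes beta * input + alpha * (mat1 @ mat2), where input and mat1 are COO sparse tensors and mat2 is dense, and emits the result in sparse form. As a way to sanity-check the arithmetic, a fully dense reference sketch is shown below; it is an illustration under assumed [nnz, 2] index pairs and row-major layout, not the kernel's algorithm.

#include <cstdint>
#include <vector>

// Hypothetical dense reference, not the kernel: out = beta * input + alpha * (mat1 @ mat2).
std::vector<float> SspaddmmDenseReference(const std::vector<int64_t> &in_idx, const std::vector<float> &in_val,
                                          const std::vector<int64_t> &m1_idx, const std::vector<float> &m1_val,
                                          const std::vector<float> &mat2,  // [k, n], row-major
                                          int64_t m, int64_t n, float alpha, float beta) {
  std::vector<float> out(m * n, 0.0f);
  for (size_t i = 0; i < in_val.size(); ++i) {  // beta * input, scattered from (row, col) pairs
    out[in_idx[2 * i] * n + in_idx[2 * i + 1]] += beta * in_val[i];
  }
  for (size_t i = 0; i < m1_val.size(); ++i) {  // alpha * (mat1 @ mat2), one scaled row of mat2 per non-zero
    const int64_t row = m1_idx[2 * i];
    const int64_t col = m1_idx[2 * i + 1];
    for (int64_t j = 0; j < n; ++j) {
      out[row * n + j] += alpha * m1_val[i] * mat2[col * n + j];
    }
  }
  return out;
}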

View File

@ -0,0 +1,66 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SSPADDMM_H_
#define AICPU_KERNELS_NORMALIZED_SSPADDMM_H_
#include <deque>
#include <set>
#include <unordered_map>
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
namespace aicpu {
class SspaddmmCpuKernel : public CpuKernel {
public:
SspaddmmCpuKernel() = default;
uint32_t ValidParam(CpuKernelContext &ctx);
virtual ~SspaddmmCpuKernel() = default;
uint32_t cnt_ = 0;
const uint32_t kInputNum = 9;
const uint32_t kOutputNum = 3;
const int64_t kParallelDataNumSameShape_ = 14 * 1024;
const int64_t kParallelDataNumSameShapeMid_ = 7 * 1024;
template <typename T>
void Clear(Tensor *tensor, CpuKernelContext &ctx);
template <typename T>
void ClearIndices(Tensor *tensor, CpuKernelContext &ctx);
template <typename T1>
uint32_t BoundaryCheck(Tensor *, Tensor *, int64_t, CpuKernelContext &);
template <typename T>
uint32_t SspaddmmCompute(CpuKernelContext &ctx);
template <typename T_idx, typename T>
uint32_t SparseAddSparse(CpuKernelContext &ctx, Tensor *input_indices_tensor, T *in_val_addr,
Tensor *output_indices_tensor, Tensor *output_values_tensor);
template <typename T_idx, typename T>
uint32_t SparseMulDense(CpuKernelContext &ctx, Tensor *mat1_indices_tensor, T *mat1_val_addr,
Tensor *mat2_values_tensor, Tensor *output_indices_tensor, Tensor *output_values_tensor,
const int64_t row, const int64_t col);
template <typename T>
T *ScalarSparseMul(CpuKernelContext &ctx, Tensor *vals, Tensor *scalar);
int64_t GetIndicesNum(Tensor *tensor);
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
};
}  // namespace aicpu
#endif

View File

@ -0,0 +1,98 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "trace.h"
#include "cpu_kernel_utils.h"
#include <cstring>
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 1;
const uint32_t InputShapeDim = 2;
const uint32_t OutputShapeDim = 1;
const uint64_t OutputShapeDimSize = 1;
const char *kTrace = "Trace";
#define TRACE_COMPUTE_CASE(DTYPE, INPUT, OUTPUT, CTX, TYPE) \
case (DTYPE): { \
uint32_t result = TraceCompute<TYPE>(INPUT, OUTPUT, CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Trace kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TraceCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Trace check input and output number failed.");
Tensor *input_tensor = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Trace get input data failed.")
KERNEL_CHECK_NULLPTR(input_tensor->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Trace get input shape failed")
if (input_tensor->GetTensorShape()->GetDims() != InputShapeDim) {
KERNEL_LOG_ERROR("Trace input dim must be 2!");
return KERNEL_STATUS_PARAM_INVALID;
}
// check output tensor
Tensor *output_tensor = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output_tensor, KERNEL_STATUS_PARAM_INVALID, "Trace get output failed.")
KERNEL_CHECK_NULLPTR(output_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Trace get output data failed.")
KERNEL_CHECK_NULLPTR(output_tensor->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Trace get output shape failed")
auto input_dtype = input_tensor->GetDataType();
switch (input_dtype) {
TRACE_COMPUTE_CASE(DT_INT8, input_tensor, output_tensor, ctx, int8_t)
TRACE_COMPUTE_CASE(DT_UINT8, input_tensor, output_tensor, ctx, uint8_t)
TRACE_COMPUTE_CASE(DT_INT16, input_tensor, output_tensor, ctx, int16_t)
TRACE_COMPUTE_CASE(DT_UINT16, input_tensor, output_tensor, ctx, uint16_t)
TRACE_COMPUTE_CASE(DT_INT32, input_tensor, output_tensor, ctx, int32_t)
TRACE_COMPUTE_CASE(DT_UINT32, input_tensor, output_tensor, ctx, uint32_t)
TRACE_COMPUTE_CASE(DT_INT64, input_tensor, output_tensor, ctx, int64_t)
TRACE_COMPUTE_CASE(DT_UINT64, input_tensor, output_tensor, ctx, uint64_t)
TRACE_COMPUTE_CASE(DT_FLOAT16, input_tensor, output_tensor, ctx, Eigen::half)
TRACE_COMPUTE_CASE(DT_FLOAT, input_tensor, output_tensor, ctx, float)
TRACE_COMPUTE_CASE(DT_DOUBLE, input_tensor, output_tensor, ctx, double)
default:
      KERNEL_LOG_ERROR("Trace kernel data type [%s] not support.", DTypeStr(input_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TraceCpuKernel::TraceCompute(Tensor *input, Tensor *output, CpuKernelContext &ctx) {
auto inputDataAddr = reinterpret_cast<T *>(input->GetData());
auto outputDataAddr = reinterpret_cast<T *>(output->GetData());
auto input_shape = ctx.Input(0)->GetTensorShape();
int64_t inputLine = input_shape->GetDimSize(0), inputCol = input_shape->GetDimSize(1);
auto min_shape = std::min(inputLine, inputCol);
memset(outputDataAddr, 0, sizeof(T));
for (int64_t i = 0; i < min_shape; i++) {
*(outputDataAddr) += *(inputDataAddr + i * inputCol + i);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTrace, TraceCpuKernel);
} // namespace aicpu
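
TraceCompute above sums the main-diagonal elements of the rows x cols input, visiting min(rows, cols) entries of the flat row-major buffer. A tiny standalone reference with a worked example is sketched below; the function name is illustrative only.

#include <cstdint>
#include <vector>

// Hypothetical reference, not the kernel: sum of the main diagonal of a rows x cols matrix.
double TraceReference(const std::vector<double> &mat, int64_t rows, int64_t cols) {
  double acc = 0.0;
  for (int64_t i = 0; i < rows && i < cols; ++i) {
    acc += mat[i * cols + i];
  }
  return acc;
}
// e.g. mat = {1, 2, 3, 4, 5, 6} with rows = 2, cols = 3 gives 1 + 5 = 6.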

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRACE_H_
#define AICPU_KERNELS_NORMALIZED_TRACE_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TraceCpuKernel : public CpuKernel {
public:
TraceCpuKernel() = default;
~TraceCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t TraceCompute(Tensor *input, Tensor *output, CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,177 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "trace_grad.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
#include "Eigen/Core"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kTraceGrad = "TraceGrad";
} // namespace
// Define the aicpu namespace.
namespace aicpu {
// Implement the Compute function of the custom operator class.
uint32_t TraceGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Tracegrad check input and output number failed.");
Tensor *y_grad = ctx.Input(0);
Tensor *x_shape = ctx.Input(1);
Tensor *x_grad = ctx.Output(0);
KERNEL_CHECK_NULLPTR(y_grad->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.");
KERNEL_CHECK_NULLPTR(x_shape->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.");
KERNEL_CHECK_NULLPTR(x_grad->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed");
  KERNEL_CHECK_FALSE(ctx.Input(1)->NumElements() == 2, KERNEL_STATUS_PARAM_INVALID,
                     "Expected x_shape to have 2 elements, got [%lld].", ctx.Input(1)->NumElements());
KERNEL_LOG_DEBUG(
"TraceGradCpuKernel[%s], y_grad: size[%llu];"
"x_shape: size[%llu], x_grad: size[%llu].",
ctx.GetOpType().c_str(), y_grad->GetDataSize(), x_shape->GetDataSize(), x_grad->GetDataSize());
DataType data_type = ctx.Input(0)->GetDataType();
DataType shape_type = ctx.Input(1)->GetDataType();
switch (data_type) {
case DT_INT8:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<int8_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<int8_t, int64_t>(ctx);
default:
break;
}
case DT_INT16:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<int16_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<int16_t, int64_t>(ctx);
default:
break;
}
case DT_INT32:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<int32_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<int32_t, int64_t>(ctx);
default:
break;
}
case DT_INT64:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<int64_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<int64_t, int64_t>(ctx);
default:
break;
}
case DT_UINT8:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<uint8_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<uint8_t, int64_t>(ctx);
default:
break;
}
case DT_UINT16:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<uint16_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<uint16_t, int64_t>(ctx);
default:
break;
}
case DT_UINT32:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<uint32_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<uint32_t, int64_t>(ctx);
default:
break;
}
case DT_UINT64:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<uint64_t, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<uint64_t, int64_t>(ctx);
default:
break;
}
case DT_FLOAT16:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<Eigen::half, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<Eigen::half, int64_t>(ctx);
default:
break;
}
case DT_FLOAT:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<float, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<float, int64_t>(ctx);
default:
break;
}
case DT_DOUBLE:
switch (shape_type) {
case DT_INT32:
return TraceGradCompute<double, int32_t>(ctx);
case DT_INT64:
return TraceGradCompute<double, int64_t>(ctx);
default:
break;
}
    default:
      // Unsupported shape_type values also fall through the inner switches to this default.
      KERNEL_LOG_ERROR("TraceGrad kernel data type [%s] or shape type [%s] not support.", DTypeStr(data_type).c_str(),
                       DTypeStr(shape_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t TraceGradCpuKernel::TraceGradCompute(CpuKernelContext &ctx) {
auto input_x1 = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
auto input_x2 = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto output_x = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
T2 m = *(input_x2);
T2 n = *(input_x2 + 1);
  // Write directly to the output: zero everything, then place y_grad on the main diagonal.
  for (T2 i = 0; i < m * n; i++) *(output_x + i) = static_cast<T1>(0);
  for (T2 i = 0; i < m && i < n; i++) *(output_x + i * n + i) = *(input_x1);
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTraceGrad, TraceGradCpuKernel);
} // namespace aicpu
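
Because d(trace(X))/dX is the identity pattern, TraceGradCompute above only has to broadcast the scalar y_grad onto the main diagonal of an m x n zero matrix. A minimal reference sketch with an example follows; the name and float element type are assumptions for illustration.

#include <cstdint>
#include <vector>

// Hypothetical reference, not the kernel: scatter the scalar y_grad onto the main diagonal.
std::vector<float> TraceGradReference(float y_grad, int64_t m, int64_t n) {
  std::vector<float> x_grad(m * n, 0.0f);
  for (int64_t i = 0; i < m && i < n; ++i) {
    x_grad[i * n + i] = y_grad;  // gradient of trace flows only to diagonal entries
  }
  return x_grad;
}
// e.g. TraceGradReference(2.5f, 2, 3) returns {2.5, 0, 0, 0, 2.5, 0}.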

View File

@ -0,0 +1,21 @@
#ifndef AICPU_KERNELS_NORMALIZED_TRACEGRAD_H_
#define AICPU_KERNELS_NORMALIZED_TRACEGRAD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TraceGradCpuKernel : public CpuKernel {
public:
TraceGradCpuKernel() = default;
~TraceGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t TraceGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,414 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tridiagonal_solve.h"
#include <iostream>
#include "Eigen/Core"
#include "Eigen/LU"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
using namespace Eigen;
using namespace std;
namespace {
const char *TRIDIAGONALSOLVE = "TridiagonalSolve";
// Threshold that decides whether multi-threading is enabled
const int64_t kParallelDataNumSameShape = 8 * 1024;
} // namespace
// Define the aicpu namespace.
namespace aicpu {
// Read the inputs and outputs and validate the parameters.
uint32_t TridiagonalSolveCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
  // get input and output tensor pointers
diags_tensor_ = ctx.Input(0);
rhs_tensor_ = ctx.Input(1);
output_tensor_ = ctx.Output(0);
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, 2, 1), "TridiagonalSolve check input and output number failed.");
  // get shape pointers
std::shared_ptr<TensorShape> diags_shape = diags_tensor_->GetTensorShape();
KERNEL_CHECK_NULLPTR(diags_shape, KERNEL_STATUS_PARAM_INVALID, "Get shape of input[0], diags failed");
std::shared_ptr<TensorShape> rhs_shape = rhs_tensor_->GetTensorShape();
KERNEL_CHECK_NULLPTR(rhs_shape, KERNEL_STATUS_PARAM_INVALID, "Get shape of input[1], rhs failed");
  // get input ranks
int32_t diags_rank = diags_shape->GetDims();
int32_t rhs_rank = rhs_shape->GetDims();
  // get the sizes of the diags and rhs matrices
rhs_size =
rhs_tensor_->GetTensorShape()->GetDimSize(rhs_rank - 1) * rhs_tensor_->GetTensorShape()->GetDimSize(rhs_rank - 2);
diags_size = diags_tensor_->GetTensorShape()->GetDimSize(diags_rank - 1) *
diags_tensor_->GetTensorShape()->GetDimSize(diags_rank - 2);
  // get the dim-size vectors of the shapes
std::vector<int64_t> diags_dimsize = diags_shape->GetDimSizes();
std::vector<int64_t> rhs_dimsize = rhs_shape->GetDimSizes();
// get partial_pivoting
partial_pivoting = ctx.GetAttr("partial_pivoting");
  // get diags_type_ and rhs_type_
diags_type_ = static_cast<DataType>(diags_tensor_->GetDataType());
rhs_type_ = static_cast<DataType>(rhs_tensor_->GetDataType());
// get data_type_
data_type_ = rhs_type_;
  // parameter validation
  // 1) rank of diags is less than 2
if (diags_rank < 2) {
KERNEL_LOG_ERROR("Expected diags to have rank at least 2, got[%d]", diags_rank);
return KERNEL_STATUS_PARAM_INVALID;
}
  // 2) ranks of diags and rhs mismatch
if (rhs_rank != diags_rank) {
KERNEL_LOG_ERROR("Expected the rank of rhs to be [%d] or [%d], got [%d]", diags_rank - 1, diags_rank, rhs_rank);
return KERNEL_STATUS_PARAM_INVALID;
}
  // 3) diags does not have 3 rows
DimSize0 = diags_shape->GetDimSize(diags_rank - 2);
if (DimSize0 != 3) {
KERNEL_LOG_ERROR("Expected 3 diagonals got [%d]", DimSize0);
return KERNEL_STATUS_PARAM_INVALID;
}
  // 4) batch shapes mismatch
for (int i = 0; i < diags_rank - 2; i++) {
if (diags_dimsize[i] != rhs_dimsize[i]) {
KERNEL_LOG_ERROR("Batch shapes of diags and rhs are incompatible");
return KERNEL_STATUS_PARAM_INVALID;
}
}
  // 5) data types of diags and rhs mismatch
if (diags_type_ != rhs_type_) {
KERNEL_LOG_ERROR("The type of diags and rhs are incompatible");
return KERNEL_STATUS_PARAM_INVALID;
}
  // 6) empty input
if (diags_dimsize.size() == 0 || rhs_dimsize.size() == 0) {
KERNEL_LOG_ERROR("The input is null");
return KERNEL_STATUS_PARAM_INVALID;
}
  // 7) lengths of diags and rhs mismatch
int DimSize1 = diags_shape->GetDimSize(diags_rank - 1);
int RhsSize0 = rhs_shape->GetDimSize(rhs_rank - 2);
if (DimSize1 != RhsSize0) {
KERNEL_LOG_ERROR("The length of diags and rhs are incompatible");
return KERNEL_STATUS_PARAM_INVALID;
}
  // 8) unsupported input data type
if (diags_type_ != DT_FLOAT && diags_type_ != DT_DOUBLE && diags_type_ != DT_COMPLEX64 &&
diags_type_ != DT_COMPLEX128) {
KERNEL_LOG_ERROR("The type of inputs are invalid");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
// Dispatch the template function by data type and choose the computation path according to partial_pivoting.
uint32_t TridiagonalSolveCpuKernel::choosedatatype_(CpuKernelContext &ctx, size_t nth_batch, int i) {
if (partial_pivoting->GetBool()) {
switch (data_type_) {
case DT_FLOAT: {
res = DoCompute1<float>(ctx, nth_batch, i);
break;
}
case DT_DOUBLE: {
res = DoCompute1<double>(ctx, nth_batch, i);
break;
}
case DT_COMPLEX64: {
res = DoCompute1<std::complex<float>>(ctx, nth_batch, i);
break;
}
case DT_COMPLEX128: {
res = DoCompute1<std::complex<double>>(ctx, nth_batch, i);
break;
}
default: {
KERNEL_LOG_ERROR(
"Tridiagonal-solve op support input tensor type: float、double、complex64、complex128,should not be tensor "
"type [%s]",
data_type_);
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else {
switch (data_type_) {
case DT_FLOAT:
res = DoCompute2<float>(ctx, nth_batch, i);
break;
case DT_DOUBLE:
res = DoCompute2<double>(ctx, nth_batch, i);
break;
case DT_COMPLEX64:
res = DoCompute2<std::complex<float>>(ctx, nth_batch, i);
break;
case DT_COMPLEX128:
res = DoCompute2<std::complex<double>>(ctx, nth_batch, i);
break;
default: {
KERNEL_LOG_ERROR(
"Tridiagonal-solve op support input tensor type: float、double、complex64、complex128,should not be tensor "
"type [%s]",
data_type_);
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
return res;
}
// Compute path used when partial_pivoting is true
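// Sketch of the method implemented below: at step i the pivot |u(i, 0)| is compared with |subdiag(i + 1)|.
// Without a row interchange,
//   factor = subdiag(i + 1) / u(i, 0),
//   u(i + 1, 0) = diag(i + 1) - factor * u(i, 1),
//   x.row(i + 1) = rhs.row(i + 1) - factor * x.row(i);
// otherwise rows i and i + 1 are swapped first so that the larger entry becomes the pivot. The banded upper
// factor kept in u (diagonal plus up to two superdiagonals) is then solved by back substitution.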
template <typename T>
uint32_t TridiagonalSolveCpuKernel::DoCompute1(CpuKernelContext &ctx, size_t nth_batch, int i) {
// Compute the problem sizes
int rhs_rank = rhs_tensor_->GetTensorShape()->GetDims();
int diags_rank = diags_tensor_->GetTensorShape()->GetDims();
const int batch = rhs_tensor_->GetTensorShape()->GetDimSize(rhs_rank - 1);
const int n = diags_tensor_->GetTensorShape()->GetDimSize(diags_rank - 1);
// Compute the start addresses of this batch slice
auto a = static_cast<T *>(diags_tensor_->GetData());
auto b = static_cast<T *>(rhs_tensor_->GetData());
auto value = reinterpret_cast<T *>(output_tensor_->GetData());
if (i == -1) {
a += nth_batch * diags_size;
b += nth_batch * rhs_size;
value += nth_batch * rhs_size;
} else {
a += i * diags_size;
b += i * rhs_size;
value += i * rhs_size;
}
const T zero = 0;
// Intermediate workspace for the elimination
Array<T, Dynamic, 3> u(n, 3);
// Inputs: superdiag, diag and subdiag
Array<T, Dynamic, 1> superdiag(n);
Array<T, Dynamic, 1> diag(n);
Array<T, Dynamic, 1> subdiag(n);
// Input rhs
Array<T, Dynamic, Dynamic> rhs(n, batch);
// Result x
Array<T, Dynamic, Dynamic> x(n, batch);
// Load the input data into the local arrays
for (int i = 0; i < n; i++) {
for (int j = 0; j < batch; j++) {
rhs(i, j) = *(b + i * batch + j);
}
}
for (int i = 0; i < n; i++) {
superdiag(i) = *(a + i);
diag(i) = *(a + n + i);
subdiag(i) = *(a + 2 * n + i);
}
// Forward elimination
u(0, 0) = diag(0);
u(0, 1) = superdiag(0);
x.row(0) = rhs.row(0);
for (int i = 0; i < n - 1; ++i) {
if (abs(u(i, 0)) >= abs(subdiag(i + 1))) {
// No row interchange.
if (u(i, 0) == zero) {
KERNEL_LOG_ERROR("The first element of diag should not be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
const T factor = subdiag(i + 1) / u(i, 0);
u(i + 1, 0) = diag(i + 1) - factor * u(i, 1);
x.row(i + 1) = rhs.row(i + 1) - factor * x.row(i);
if (i != n - 2) {
u(i + 1, 1) = superdiag(i + 1);
u(i, 2) = 0;
}
} else {
// Interchange rows i and i + 1.
const T factor = u(i, 0) / subdiag(i + 1);
u(i, 0) = subdiag(i + 1);
u(i + 1, 0) = u(i, 1) - factor * diag(i + 1);
u(i, 1) = diag(i + 1);
x.row(i + 1) = x.row(i) - factor * rhs.row(i + 1);
x.row(i) = rhs.row(i + 1);
if (i != n - 2) {
u(i, 2) = superdiag(i + 1);
u(i + 1, 1) = -factor * superdiag(i + 1);
}
}
}
if (u(n - 1, 0) == zero) {
KERNEL_LOG_ERROR("The last element of diag should not be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
// Back substitution: compute the final result and store it at the corresponding output addresses
x.row(n - 1) /= u(n - 1, 0);
for (int j = 0; j < batch; j++) {
*(value + (n - 1) * batch + j) = x(n - 1, j);
}
x.row(n - 2) = (x.row(n - 2) - u(n - 2, 1) * x.row(n - 1)) / u(n - 2, 0);
for (int j = 0; j < batch; j++) {
*(value + (n - 2) * batch + j) = x(n - 2, j);
}
for (int i = n - 3; i >= 0; --i) {
x.row(i) = (x.row(i) - u(i, 1) * x.row(i + 1) - u(i, 2) * x.row(i + 2)) / u(i, 0);
for (int j = 0; j < batch; j++) {
*(value + i * batch + j) = x(i, j);
}
}
// KERNEL_LOG_INFO("TridiagonalSolveCpuKernel::DoCompute end! ");
return KERNEL_STATUS_OK;
}
// Compute path used when partial_pivoting is false
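// Sketch of the Thomas algorithm used below, with a = subdiag, b = diag, c = superdiag, d = rhs:
//   forward sweep:  u(0) = c(0) / b(0),  x(0) = d(0) / b(0),
//                   u(i) = c(i) / (b(i) - a(i) * u(i - 1)),
//                   x(i) = (d(i) - a(i) * x(i - 1)) / (b(i) - a(i) * u(i - 1));
//   back substitution:  x(i) -= u(i) * x(i + 1), for i = n - 2 .. 0.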
template <typename T>
uint32_t TridiagonalSolveCpuKernel::DoCompute2(CpuKernelContext &ctx, size_t nth_batch, int i) {
// Compute the problem sizes
int rhs_rank = rhs_tensor_->GetTensorShape()->GetDims();
int diags_rank = diags_tensor_->GetTensorShape()->GetDims();
const int batch = rhs_tensor_->GetTensorShape()->GetDimSize(rhs_rank - 1);
const int n = diags_tensor_->GetTensorShape()->GetDimSize(diags_rank - 1);
// Compute the start addresses of this batch slice
auto a = static_cast<T *>(diags_tensor_->GetData());
auto b = static_cast<T *>(rhs_tensor_->GetData());
auto value = reinterpret_cast<T *>(output_tensor_->GetData());
if (i == -1) {
a += nth_batch * diags_size;
b += nth_batch * rhs_size;
value += nth_batch * rhs_size;
} else {
a += i * diags_size;
b += i * rhs_size;
value += i * rhs_size;
}
// Intermediate workspace
Array<T, Dynamic, 3> u(n, 3);
// Inputs: superdiag, diag and subdiag
Array<T, Dynamic, 1> superdiag(n);
Array<T, Dynamic, 1> diag(n);
Array<T, Dynamic, 1> subdiag(n);
// Input rhs
Array<T, Dynamic, Dynamic> rhs(n, batch);
// Result x
Array<T, Dynamic, Dynamic> x(n, batch);
const T zero = 0;
// Load the inputs and run the forward sweep
for (int i = 0; i < n; i++) {
for (int j = 0; j < batch; j++) {
rhs(i, j) = *(b + i * batch + j);
}
}
for (int i = 0; i < n; i++) {
superdiag(i) = *(a + i);
diag(i) = *(a + n + i);
subdiag(i) = *(a + 2 * n + i);
}
if (diag(0) == zero) {
KERNEL_LOG_ERROR("The first element of diag should not be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
u(0) = superdiag(0) / diag(0);
x.row(0) = rhs.row(0) / diag(0);
for (int i = 1; i < n; ++i) {
auto denom = diag(i) - subdiag(i) * u(i - 1);
if (denom == zero) {
KERNEL_LOG_ERROR("The diag should not be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
u(i) = superdiag(i) / denom;
x.row(i) = (rhs.row(i) - subdiag(i) * x.row(i - 1)) / denom;
}
for (int i = n - 2; i >= 0; --i) {
x.row(i) -= u(i) * x.row(i + 1);
}
// Store the final result at the corresponding output addresses
for (int i = 0; i < n; i++) {
for (int j = 0; j < batch; j++) {
*(value + i * batch + j) = x(i, j);
}
}
KERNEL_LOG_INFO("TridiagonalSolveCpuKernel::DoCompute end! ");
return KERNEL_STATUS_OK;
}
// Kernel entry point
uint32_t TridiagonalSolveCpuKernel::Compute(CpuKernelContext &ctx) {
res = GetInputAndCheck(ctx);
if (res != KERNEL_STATUS_OK) {
return res;
}
data_size = ctx.Input(0)->NumElements();
matrix_num = ctx.Input(0)->NumElements() / diags_size;
// Decide whether to use multi-threading
if (data_size >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
// Use CpuKernelUtils::GetCPUNum to obtain the number of AI CPU cores
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
// If the number of cores exceeds the number of matrices, cap max_core_num at matrix_num
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
// Lambda executed by each worker thread
auto shared_tridiagonalsolve = [&](size_t start, size_t end) {
for (size_t nth_batch = start; nth_batch < end; nth_batch++) res = choosedatatype_(ctx, nth_batch, -1);
};
CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shared_tridiagonalsolve);
} else {
// If the data size is less than 8K elements, do not slice; compute on a single AI CPU core.
for (size_t nth_batch = 0; nth_batch < matrix_num; nth_batch++) res = choosedatatype_(ctx, -1, nth_batch);
}
return res;
}
// Register the kernel implementation
REGISTER_CPU_KERNEL(TRIDIAGONALSOLVE, TridiagonalSolveCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,46 @@
#ifndef AICPU_KERNELS_NORMALIZED_TRIDIAGONAL_SOLVE_H
#define AICPU_KERNELS_NORMALIZED_TRIDIAGONAL_SOLVE_H
#include "cpu_ops_kernel.h"
#include <vector>
namespace aicpu {
class TridiagonalSolveCpuKernel : public CpuKernel {
public:
TridiagonalSolveCpuKernel() = default;
~TridiagonalSolveCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
Tensor *diags_tensor_ = nullptr;
Tensor *rhs_tensor_ = nullptr;
AttrValue *partial_pivoting = nullptr;
Tensor *output_tensor_ = nullptr;
size_t matrix_num;
int64_t data_size;
uint32_t res;
int32_t diags_rank;
int32_t rhs_rank;
int32_t diags_size;
int32_t rhs_size;
std::shared_ptr<TensorShape> diags_shape;
std::shared_ptr<TensorShape> rhs_shape;
std::vector<int64_t> diags_dimsize;
std::vector<int64_t> rhs_dimsize;
DataType diags_type_ = DT_DOUBLE;
DataType rhs_type_ = DT_DOUBLE;
DataType data_type_ = DT_DOUBLE;
int DimSize0 = 0;
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
uint32_t choosedatatype_(CpuKernelContext &ctx, size_t nth_batch, int i);
template <typename T>
uint32_t DoCompute1(CpuKernelContext &ctx, size_t nth_batch, int i);
template <typename T>
uint32_t DoCompute2(CpuKernelContext &ctx, size_t nth_batch, int i);
};
}  // namespace aicpu
#endif

View File

@ -0,0 +1,132 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "truncated_normal.h"
#include <cmath>
#include <ctime>
#include <iostream>
#include <random>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
namespace {
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 1;
const uint32_t kInputDims = 1;
const uint32_t kInputSizes = 2;
const char *kTruncatedNormal = "TruncatedNormal";
} // namespace
namespace aicpu {
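// Output values are produced by rejection sampling: draw from a standard normal distribution and keep only
// samples that fall within two standard deviations of the mean, i.e. inside [-2, 2].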
template <typename T>
uint32_t TruncatedNormalCpuKernel::DoCompute(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *output = ctx.Output(0);
auto output_nums = output->NumElements();
AttrValue *seed_ptr = ctx.GetAttr("seed");
auto seed_base1 = (seed_ptr == nullptr) ? 0 : (seed_ptr->GetInt());
AttrValue *seed2_ptr = ctx.GetAttr("seed2");
auto seed_base2 = (seed2_ptr == nullptr) ? 0 : (seed2_ptr->GetInt());
auto output_type = output->GetDataType();
auto input_data_nums = input->NumElements();
auto input_data = reinterpret_cast<T *>(input->GetData());
std::vector<int64_t> out_put_dims;
for (auto i = 0; i < input_data_nums; ++i) {
if (*(input_data + i) <= 0) {
KERNEL_LOG_ERROR("Shape elements must be > 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
out_put_dims.push_back(input_data[i]);
}
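// Seed priority: the attribute "seed2" wins if it is non-zero, then "seed", otherwise a value drawn from
// std::random_device is used.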
std::random_device rd;
size_t seedc = seed_base2 != 0 ? seed_base2 : (seed_base1 != 0 ? seed_base1 : rd());
std::default_random_engine final_seed(seedc);
if (output_type == DT_FLOAT16) {
auto output_data = reinterpret_cast<Eigen::half *>(output->GetData());
std::normal_distribution<float> dis(0, 1);
for (int j = 0; j < output_nums;) {
auto data = dis(final_seed);
if (data >= -2 && data <= 2) {
*(output_data + j) = static_cast<Eigen::half>(data);
++j;
}
}
} else if (output_type == DT_FLOAT) {
auto output_data = reinterpret_cast<float_t *>(output->GetData());
std::normal_distribution<float> dis(0, 1);
for (int j = 0; j < output_nums;) {
auto data = dis(final_seed);
if (data >= -2 && data <= 2) {
*(output_data + j) = data;
++j;
}
}
} else {
auto output_data = reinterpret_cast<double_t *>(output->GetData());
std::normal_distribution<double> dis(0, 1);
for (int j = 0; j < output_nums;) {
auto data = dis(final_seed);
if (data >= -2 && data <= 2) {
*(output_data + j) = data;
++j;
}
}
}
output->GetTensorShape()->SetDimSizes(out_put_dims);
return KERNEL_STATUS_OK;
}
uint32_t TruncatedNormalCpuKernel::DataAndTypeCheck(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *output = ctx.Output(0);
auto input_data_nums = input->NumElements();
KERNEL_CHECK_FALSE((input_data_nums >= kInputSizes), KERNEL_STATUS_PARAM_INVALID, "Input data elements must be >= 2.");
KERNEL_CHECK_FALSE((input->GetTensorShape()->GetDimSizes().size() == kInputDims), KERNEL_STATUS_PARAM_INVALID,
"Input tensor must be a 1-D tensor.");
auto input_datatype = input->GetDataType();
auto output_datatype = output->GetDataType();
KERNEL_CHECK_FALSE((input_datatype == DT_INT32 || input_datatype == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"Input type must be int64 or int32.");
KERNEL_CHECK_FALSE((output_datatype == DT_FLOAT16 || output_datatype == DT_FLOAT || output_datatype == DT_DOUBLE),
KERNEL_STATUS_PARAM_INVALID, "Out put type must be one of float16, float32 or double.");
return KERNEL_STATUS_OK;
}
uint32_t TruncatedNormalCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check params failed.", kTruncatedNormal);
KERNEL_HANDLE_ERROR(DataAndTypeCheck(ctx), " TruncatedNormal input elements value check failed.");
auto input_datatype = ctx.Input(0)->GetDataType();
uint32_t ret;
switch (input_datatype) {
case DT_INT32:
ret = DoCompute<int32_t>(ctx);
break;
case DT_INT64:
ret = DoCompute<int64_t>(ctx);
break;
default: {
KERNEL_LOG_WARN("TruncatedNormal kernel data type [%s] not support.", DTypeStr(input_datatype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), ret, "Compute failed.");
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTruncatedNormal, TruncatedNormalCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRUNCATEDNORMAL_H_
#define AICPU_KERNELS_NORMALIZED_TRUNCATEDNORMAL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TruncatedNormalCpuKernel : public CpuKernel {
public:
TruncatedNormalCpuKernel() = default;
~TruncatedNormalCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
template <typename T>
static uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -36,6 +36,66 @@ const T SubtleMustCopy(const T &x) {
}
} // namespace aicpu
namespace aicpu {
class DimComparator {
public:
DimComparator(const TTypes<int64_t>::Matrix &ix, const std::vector<int64_t> &order, const std::vector<int64_t> &shape)
: ix_(ix), order_(order), dims_(shape.size()) {}
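// Returns true when index row i of ix_ sorts before row j, comparing the dimensions lexicographically in
// the order given by order_.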
inline bool operator()(const int64_t i, const int64_t j) const {
for (int di = 0; di < dims_; ++di) {
const int64_t d = order_[di];
if (ix_(i, d) < ix_(j, d)) return true;
if (ix_(i, d) > ix_(j, d)) return false;
}
return false;
}
// Compares two indices taken from corresponding index matrices, using the
// standard, row-major (or lexicographic) order. Useful for cases that need
// to distinguish between all three orderings (<, ==, >).
inline static int cmp(const TTypes<int64_t>::ConstMatrix &a_idx, const TTypes<int64_t>::ConstMatrix &b_idx,
const int64_t a_row, const int64_t b_row, const int dims) {
for (int d = 0; d < dims; ++d) {
const int64_t a = a_idx(a_row, d);
const int64_t b = b_idx(b_row, d);
if (a < b) {
return -1;
} else if (a > b) {
return 1;
}
}
return 0;
}
protected:
const TTypes<int64_t>::Matrix ix_;
const std::vector<int64_t> order_;
const int64_t dims_;
const std::vector<int64_t> *ix_order_;
};
template <int ORDER_DIM>
class FixedDimComparator : DimComparator {
public:
FixedDimComparator(const TTypes<int64_t>::Matrix &ix, const std::vector<int64_t> &order,
const std::vector<int64_t> &shape)
: DimComparator(ix, order, shape) {}
inline bool operator()(const int64_t i, const int64_t j) const {
bool value = false;
for (int di = 0; di < ORDER_DIM; ++di) {
const int64_t d = order_[di];
if (ix_(i, d) < ix_(j, d)) {
value = true;
break;
}
if (ix_(i, d) > ix_(j, d)) break;
}
return value;
}
};
} // namespace aicpu
namespace aicpu {
class SparseTensor {
public:
@ -58,6 +118,61 @@ class SparseTensor {
*/
uint32_t IndicesValid(CpuKernelContext &ctx) const;
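/*
 * reorder indices and values in place
 * Sorts the stored indices (and their matching values) so that they follow the given dimension order,
 * e.g. row-major order when order is 0, 1, ..., dims - 1.
 * @param order: dimension order to sort by; its length must equal the SparseTensor rank
 */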
template <typename T>
void Reorder(const std::vector<int64_t> &order) {
int32_t order_size = static_cast<int32_t>(order.size());
if (order_size != dims_) {
KERNEL_LOG_ERROR("Order length must be SparseTensor rank");
}
auto ix_t = ix_->matrix<int64_t>();
auto vals_t = vals_->vec<T>();
std::vector<int64_t> reorder(dims_);
std::iota(reorder.begin(), reorder.end(), 0);
// Sort to get order of indices
switch (order.size()) {
#define CASE_SORT(ORDER_SIZE) \
case (ORDER_SIZE): { \
FixedDimComparator<(ORDER_SIZE)> sorter(ix_t, order, shape_); \
std::sort(reorder.begin(), reorder.end(), sorter); \
break; \
}
CASE_SORT(0);
CASE_SORT(1);
CASE_SORT(2);
CASE_SORT(3);
CASE_SORT(4);
CASE_SORT(5);
#undef CASE_SORT
default: {
DimComparator sorter(ix_t, order, shape_);
std::sort(reorder.begin(), reorder.end(), sorter);
}
}
// We have a forward reordering, but what we'll need is a
// permutation (the inverse). This can be calculated with O(1) additional
// space and O(n) time (INVPERM), but we just do the simple thing here.
std::vector<size_t> permutation(reorder.size());
for (std::size_t n = 0; n < reorder.size(); ++n) {
permutation[reorder[n]] = n;
}
// Update indices & values by converting the permutations to
// a product of transpositions. Iterate over the cycles in the
// permutation, and convert each of those into a product of
// transpositions (swaps):
// https://en.wikipedia.org/wiki/Cyclic_permutation
// This is N swaps, 2*N comparisons.
for (std::size_t n = 0; n + 1 < permutation.size(); ++n) {
while (n != permutation[n]) {
std::size_t r = permutation[n];
std::swap_ranges(&(ix_t(n, 0)), &(ix_t(n + 1, 0)), &(ix_t(r, 0)));
std::swap(vals_t(n), vals_t(r));
std::swap(permutation[n], permutation[r]);
}
}
order_.assign(order.begin(), order.end());
}
/*
* group sparse tensor
* @return GroupIterable