!48016 second half of 0103 aicpu migration without IsInf

Merge pull request !48016 from 李林杰/0118_second_half_0103_aicpu_migration_fix_test_conj
This commit is contained in:
i-robot 2023-01-18 08:23:36 +00:00 committed by Gitee
commit 08aa1515d3
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
36 changed files with 4436 additions and 4 deletions

View File

@@ -0,0 +1,228 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "hypot.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kHypot = "Hypot";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define HYPOT_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = HypotCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Hypot kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
template <typename T>
T hypot(T a, T b) {
return std::hypot(a, b);
}
uint32_t HypotCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Hypot check input and output number failed.");
KERNEL_HANDLE_ERROR(HypotParamCheck(ctx), "Hypot check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
HYPOT_COMPUTE_CASE(DT_FLOAT, float_t, ctx)
HYPOT_COMPUTE_CASE(DT_DOUBLE, double_t, ctx)
default:
KERNEL_LOG_ERROR("Hypot kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t HypotCpuKernel::HypotParamCheck(CpuKernelContext &ctx) {
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"HypotCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HypotCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type;
if (in0_elements_nums == in1_elements_nums) {
type = BcastShapeType::SAME_SHAPE;
} else {
type = (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
}
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_hypot = [&](int64_t start, int64_t end) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(out + i) = hypot(*(in0 + i), *(in1 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(out + i) = hypot(*in0, *(in1 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(out + i) = hypot(*(in0 + i), *in1);
}
break;
default:
KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot),
"Hypot Compute failed.");
} else {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
*(out + i) = hypot(*(in0 + i), *(in1 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
*(out + i) = hypot(*in0, *(in1 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
*(out + i) = hypot(*(in0 + i), *in1);
}
break;
default:
KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HypotCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
T *in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
T *in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
T *out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_hypot = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) = hypot<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot),
"Hypot Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = hypot<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HypotCpuKernel::HypotCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kHypot, HypotCpuKernel);
} // namespace aicpu
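
For reference, the elementwise behaviour HypotCpuKernel implements reduces to std::hypot with optional one-element broadcasting; a framework-free sketch under that reading (illustrative names, not part of the change):

#include <cmath>
#include <cstdio>
#include <vector>

// Same-shape inputs are combined index by index; a one-element input is
// broadcast against the other, mirroring the SAME_SHAPE / X_ONE_ELEMENT /
// Y_ONE_ELEMENT branches above.
std::vector<double> HypotReference(const std::vector<double> &x, const std::vector<double> &y) {
  const bool x_scalar = (x.size() == 1);
  const std::size_t n = x_scalar ? y.size() : x.size();
  std::vector<double> out(n);
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = std::hypot(x_scalar ? x[0] : x[i], y.size() == 1 ? y[0] : y[i]);
  }
  return out;
}

int main() {
  auto same = HypotReference({3.0, 5.0}, {4.0, 12.0});  // {5, 13}
  auto bcast = HypotReference({3.0}, {4.0, 0.0});       // {5, 3}
  std::printf("%.1f %.1f %.1f %.1f\n", same[0], same[1], bcast[0], bcast[1]);
  return 0;
}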

View File

@@ -0,0 +1,43 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_HYPOT_H_
#define AICPU_KERNELS_NORMALIZED_HYPOT_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class HypotCpuKernel : public CpuKernel {
public:
HypotCpuKernel() = default;
~HypotCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t HypotParamCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t HypotCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,81 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "identityn.h"
#include <algorithm>
#include <vector>
#include "cpu_types.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const char *kIdentityN = "IdentityN";
} // namespace
namespace aicpu {
uint32_t IdentityNCpuKernel::IdentityNParamCheck(CpuKernelContext &ctx) {
// input size and output size check
uint32_t input_size = ctx.GetInputsSize();
uint32_t output_size = ctx.GetOutputsSize();
KERNEL_CHECK_FALSE((input_size == output_size), KERNEL_STATUS_PARAM_INVALID,
"Input size should equal to Output size.");
KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_size, output_size), "[%s] check params failed.", kIdentityN);
for (uint32_t idx = 0; idx < input_size; ++idx) {
Tensor *in_tensor = ctx.Input(idx);
Tensor *out_tensor = ctx.Output(idx);
// TensorShape check
auto in_shape = in_tensor->GetTensorShape();
auto out_shape = out_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((in_shape->GetDimSizes() == out_shape->GetDimSizes()), KERNEL_STATUS_PARAM_INVALID,
"In tensor shape should equal to out tensor shape.");
// DataType Check
DataType in_type = in_tensor->GetDataType();
DataType out_type = out_tensor->GetDataType();
KERNEL_CHECK_FALSE((in_type == out_type), KERNEL_STATUS_PARAM_INVALID,
"In tensor data type should equal to out tensor data type.");
bool type_support =
std::find(support_data_type.begin(), support_data_type.end(), in_type) != support_data_type.end();
KERNEL_CHECK_FALSE(type_support, KERNEL_STATUS_PARAM_INVALID, "IdentityN kernel data type [%s] not support.",
DTypeStr(in_type).c_str());
}
return KERNEL_STATUS_OK;
}
uint32_t IdentityNCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(IdentityNParamCheck(ctx), "IdentityNCpuKernel check params failed");
uint32_t input_size = ctx.GetInputsSize();
for (uint32_t idx = 0; idx < input_size; ++idx) {
Tensor *in_tensor = ctx.Input(idx);
Tensor *out_tensor = ctx.Output(idx);
auto in_data = in_tensor->GetData();
auto out_data = out_tensor->GetData();
uint64_t in_size = in_tensor->GetDataSize();
uint64_t out_size = out_tensor->GetDataSize();
// memory copy
if (out_data != in_data) {
int cpret = memcpy_s(out_data, out_size, in_data, in_size);
KERNEL_CHECK_FALSE((cpret == EOK), KERNEL_STATUS_INNER_ERROR,
"[%s] memcpy_s to output failed, destMax [%ld], count [%ld].", kIdentityN, out_size, in_size);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kIdentityN, IdentityNCpuKernel);
} // namespace aicpu
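
IdentityN forwards each input tensor unchanged to the matching output; a minimal framework-free sketch of that copy semantics (illustrative names, not part of the change):

#include <cstring>
#include <vector>

// Each output is a byte-for-byte copy of the matching input; empty or aliased
// buffers are left untouched, mirroring the out_data != in_data guard above.
void IdentityNReference(const std::vector<std::vector<float>> &inputs,
                        std::vector<std::vector<float>> &outputs) {
  outputs.resize(inputs.size());
  for (std::size_t i = 0; i < inputs.size(); ++i) {
    if (inputs[i].empty() || outputs[i].data() == inputs[i].data()) {
      continue;  // nothing to copy, or input and output already alias
    }
    outputs[i].resize(inputs[i].size());
    std::memcpy(outputs[i].data(), inputs[i].data(), inputs[i].size() * sizeof(float));
  }
}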

View File

@@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_
#define AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class IdentityNCpuKernel : public CpuKernel {
public:
IdentityNCpuKernel() = default;
~IdentityNCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t IdentityNParamCheck(CpuKernelContext &ctx);
const std::vector<DataType> support_data_type = {DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8,
DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE};
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,230 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "index_fill.h"
#include <securec.h>
#include <map>
#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kNumInput = 4;
const uint32_t kNumOutput = 1;
const char *kIndexFill = "IndexFill";
// when input data size is more than kParallelDataNum, use Parallel func
const uint32_t kParallelDataNum = 16 * 1024;
const uint32_t kParallelDataNumMid = 128 * 1024;
#define INDEXFILL_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("IndexFill kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t IndexFillCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNumInput, kNumOutput), "IndexFill check input and output number failed.");
// get input Tensors
for (uint32_t i = 0; i < kNumInput; ++i) {
Tensor *tensor = ctx.Input(i);
inputs_.push_back(tensor);
}
// get output Tensors
Tensor *tensor = ctx.Output(0);
outputs_.push_back(tensor);
int32_t value_dim = inputs_[3]->GetTensorShape()->GetDims();
KERNEL_CHECK_FALSE((value_dim == 0), KERNEL_STATUS_INNER_ERROR,
"IndexFill only supports a 0-dimensional value tensor, "
"but got tensor with [%d] dimension(s).",
value_dim)
DataType dim_type = inputs_[1]->GetDataType();
DataType index_type = inputs_[2]->GetDataType();
if (dim_type != DT_INT32) {
KERNEL_LOG_ERROR("IndexFill: Expected dtype int32 for dim.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (index_type != DT_INT32) {
KERNEL_LOG_ERROR("IndexFill: Expected dtype int32 for index.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
void IndexFillCpuKernel::SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim,
std::map<int32_t, bool> &index_dict) {
auto *input_x = reinterpret_cast<T *>(inputs_[0]->GetData());
auto *input_value = reinterpret_cast<T *>(inputs_[3]->GetData());
auto *output_y = reinterpret_cast<T *>(outputs_[0]->GetData());
int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims();
auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes();
int32_t dim_flag;
if (x_dim_nums != 0) {
dim_flag = *input_dim % x_dim_nums + 1;
} else {
dim_flag = 0;
}
int32_t remain_dims = 1;
if (dim_flag == x_dim_nums) {
if (dim_flag != 0) {
remain_dims = x_dims[*input_dim];
}
for (int64_t i = start; i < end; i++) {
int32_t index_flag = i % remain_dims;
std::map<int32_t, bool>::iterator f = index_dict.find(index_flag);
if (f != index_dict.end()) {
output_y[i] = *input_value;
} else {
output_y[i] = input_x[i];
}
}
} else {
for (int32_t i = *input_dim + 1; i < x_dim_nums; i++) {
remain_dims *= x_dims[i];
}
for (int64_t i = start; i < end; i++) {
int32_t index_flag = (i / remain_dims) % x_dims[*input_dim];
std::map<int32_t, bool>::iterator f = index_dict.find(index_flag);
if (f != index_dict.end()) {
output_y[i] = *input_value;
} else {
output_y[i] = input_x[i];
}
}
}
}
template <typename T>
uint32_t IndexFillCpuKernel::DoCompute(CpuKernelContext &ctx) {
int32_t *input_1 = reinterpret_cast<int32_t *>(inputs_[1]->GetData());
int32_t *input_2 = reinterpret_cast<int32_t *>(inputs_[2]->GetData());
int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims();
int32_t dim_nums = inputs_[1]->GetTensorShape()->GetDims();
int32_t index_dim_nums = inputs_[2]->GetTensorShape()->GetDims();
auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes();
uint32_t data_num = outputs_[0]->NumElements();
int64_t index_num = inputs_[2]->GetTensorShape()->NumElements();
KERNEL_CHECK_FALSE(dim_nums == 0, KERNEL_STATUS_PARAM_INVALID, "Dim has to be a scalar.")
KERNEL_CHECK_FALSE(index_dim_nums <= 1, KERNEL_STATUS_PARAM_INVALID, "Index has to be a vector/scalar.")
int32_t cur_dim = *input_1;
if (*input_1 < 0) {
*input_1 = *input_1 + x_dim_nums;
}
std::map<int32_t, bool> index_dict;
if (x_dim_nums == 0) {
for (int32_t i = 0; i < index_num; i++) {
if (input_2[i] < -1 || input_2[i] > 0) {
KERNEL_LOG_ERROR("Invalid argument 3: out of range.");
return KERNEL_STATUS_PARAM_INVALID;
} else {
index_dict.insert(std::pair<int32_t, bool>(0, true));
}
}
} else if (cur_dim < -x_dim_nums || cur_dim >= x_dim_nums) {
KERNEL_LOG_ERROR(
"Dimension out of range (expected to be in range of "
"[%d, %d], but got %d).",
0 - x_dim_nums, x_dim_nums - 1, cur_dim);
return KERNEL_STATUS_PARAM_INVALID;
} else {
for (int32_t i = 0; i < index_num; i++) {
if (input_2[i] < -x_dims[*input_1] || input_2[i] >= x_dims[*input_1]) {
KERNEL_LOG_ERROR("Invalid argument 3: out of range.");
return KERNEL_STATUS_PARAM_INVALID;
} else {
input_2[i] = (input_2[i] < 0) ? (input_2[i] + x_dims[*input_1]) : input_2[i];
index_dict.insert(std::pair<int32_t, bool>(input_2[i], true));
}
}
}
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("The number of available CPU cores must be greater than 0!");
return KERNEL_STATUS_INNER_ERROR;
}
auto sharder_index_fill = [&](int64_t start, int64_t end) { SpecialCompute<T>(start, end, input_1, index_dict); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_index_fill),
"IndexFill Compute failed.");
} else {
SpecialCompute<T>(0, data_num, input_1, index_dict);
}
return KERNEL_STATUS_OK;
}
uint32_t IndexFillCpuKernel::Compute(CpuKernelContext &ctx) {
uint32_t res = GetInputAndCheck(ctx);
if (res != KERNEL_STATUS_OK) {
return res;
}
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
INDEXFILL_COMPUTE_CASE(DT_INT8, int8_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_INT16, int16_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_INT32, int32_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_INT64, int64_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
INDEXFILL_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
INDEXFILL_COMPUTE_CASE(DT_FLOAT, float, ctx)
INDEXFILL_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kIndexFill, IndexFillCpuKernel);
} // namespace aicpu
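
The coordinate recovery in SpecialCompute, (i / remain_dims) % dims[dim] with remain_dims the product of the trailing dimensions, assumes a row-major flat layout; a framework-free reference of the same fill rule (illustrative names, not part of the change):

#include <cstdint>
#include <set>
#include <vector>

// Positions whose coordinate along `dim` (assumed already normalized to
// [0, dims.size())) is listed in `indices` are overwritten with `value`;
// everything else is passed through unchanged.
std::vector<float> IndexFillReference(std::vector<float> x, const std::vector<int64_t> &dims,
                                      int32_t dim, const std::set<int64_t> &indices, float value) {
  int64_t remain_dims = 1;
  for (std::size_t d = dim + 1; d < dims.size(); ++d) {
    remain_dims *= dims[d];
  }
  for (int64_t i = 0; i < static_cast<int64_t>(x.size()); ++i) {
    int64_t coord = (i / remain_dims) % dims[dim];
    if (indices.count(coord) != 0) {
      x[i] = value;
    }
  }
  return x;
}

// Example: a 2x3 tensor filled along dim = 1 at indices {0, 2}:
// IndexFillReference({1, 2, 3, 4, 5, 6}, {2, 3}, 1, {0, 2}, -1.0f)
//   -> {-1, 2, -1, -1, 5, -1}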

View File

@@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_
#define AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_
#include <map>
#include <vector>
#include "cpu_ops_kernel.h"
namespace aicpu {
class IndexFillCpuKernel : public CpuKernel {
public:
~IndexFillCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim, std::map<int32_t, bool> &index_dict);
std::vector<Tensor *> inputs_;
std::vector<Tensor *> outputs_;
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,185 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kldiv.h"
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const std::uint32_t kKLDivInputNum{2};
const std::uint32_t kKLDivOutputNum{1};
const std::int64_t ParallelNum{4096};
const char *kKLDiv{"KLDiv"};
} // namespace
namespace aicpu {
namespace detail {
template <typename T>
inline std::uint32_t ComputeKLDivKernel(const CpuKernelContext &ctx) {
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
auto input = static_cast<T *>(ctx.Input(0)->GetData());
auto target = static_cast<T *>(ctx.Input(1)->GetData());
auto output = static_cast<T *>(ctx.Output(0)->GetData());
std::int64_t total = ctx.Input(0)->NumElements();
std::size_t data_size = ctx.Input(0)->GetDataSize();
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
std::string reduction = ctx.GetAttr("reduction")->GetString();
if (reduction != "sum" && reduction != "batchmean" && reduction != "none" && reduction != "mean") {
KERNEL_LOG_ERROR("%s is not a valid value for reduction", reduction.c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
bool parallel_flag = false;
if (data_size > ParallelNum * sizeof(T)) {
parallel_flag = true;
}
if (cores == 0) {
KERNEL_LOG_ERROR("KLDiv: the number of available CPU cores must be greater than 0.");
return KERNEL_STATUS_INNER_ERROR;
}
T *tmp_array = nullptr;
if (reduction == "none") {
tmp_array = output;
} else {
tmp_array = new T[total];
}
if (parallel_flag) {
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
std::int64_t length = end - begin;
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input + begin, length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target + begin, length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_reduce(tmp_array + begin, length, 1);
T constant_zero{0};
array_reduce = array_target * (Eigen::log(array_target) - array_input);
for (std::int64_t idx = 0; idx < length; ++idx) {
if (!(target[begin + idx] > constant_zero)) {
array_reduce(idx) = constant_zero;
}
}
});
} else {
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input, total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target, total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_reduce(tmp_array, total, 1);
array_reduce = array_target * (Eigen::log(array_target) - array_input);
T constant_zero{0};
for (std::int64_t idx = 0; idx < total; ++idx) {
if (!(target[idx] > constant_zero)) {
array_reduce(idx) = constant_zero;
}
}
}
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > reduce(tmp_array, total, 1);
if (reduction == "sum") {
output[0] = reduce.sum();
} else if (reduction == "batchmean") {
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
output[0] = reduce.sum() / T(input_dims[0]);
} else if (reduction == "mean") {
output[0] = reduce.mean();
}
if (reduction != "none") {
delete[] tmp_array;
}
return KERNEL_STATUS_OK;
}
template <typename T>
inline std::uint32_t ComputeKLDiv(const CpuKernelContext &ctx) {
uint32_t result = ComputeKLDivKernel<T>(ctx);
if (result != 0) {
KERNEL_LOG_ERROR("KLDiv compute failed.");
}
return result;
}
inline std::uint32_t KLDivExtraCheck(const CpuKernelContext &ctx) {
if (ctx.Input(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input x data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(1)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input target data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Output(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get output y data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataSize() != ctx.Input(1)->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the target "
"[%llu].",
ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> target_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
if (input_dims.size() != target_dims.size()) {
KERNEL_LOG_ERROR(
"The data dim size of the input x [%llu] need be the same as the "
"target "
"[%llu].",
input_dims.size(), target_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
for (size_t index = 0; index < input_dims.size(); index++) {
if (input_dims[index] != target_dims[index]) {
KERNEL_LOG_ERROR("The data dim of the input x need be the same as the target.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
std::uint32_t KLDivCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
return NormalCheck(ctx, kKLDivInputNum, kKLDivOutputNum, {"reduction"}) ? KERNEL_STATUS_PARAM_INVALID
: KLDivExtraCheck(ctx);
}
// DT_FLOAT16, DT_FLOAT, DT_DOUBLE
std::uint32_t KLDivCompute(const CpuKernelContext &ctx) {
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
case DT_FLOAT16:
return ComputeKLDiv<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeKLDiv<std::float_t>(ctx);
case DT_DOUBLE:
return ComputeKLDiv<std::double_t>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
} // namespace detail
std::uint32_t KLDivCpuKernel::Compute(CpuKernelContext &ctx) {
return detail::KLDivCheck(ctx, kKLDivInputNum, kKLDivOutputNum) ? KERNEL_STATUS_PARAM_INVALID
: detail::KLDivCompute(ctx);
}
REGISTER_CPU_KERNEL(kKLDiv, KLDivCpuKernel);
} // namespace aicpu
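
The kernel evaluates the pointwise term target * (log(target) - input), zeroing it wherever target <= 0, and then applies the requested reduction; a framework-free reference of that math (illustrative names, not part of the change):

#include <cmath>
#include <cstdint>
#include <string>
#include <vector>

// Reduces the pointwise KLDiv terms the way the kernel above does:
// "sum" adds them up, "mean" divides by the element count, and "batchmean"
// divides by the leading (batch) dimension; "none" would keep the raw terms.
double KLDivReference(const std::vector<double> &input, const std::vector<double> &target,
                      const std::string &reduction, int64_t batch) {
  double sum = 0.0;
  for (std::size_t i = 0; i < input.size(); ++i) {
    sum += target[i] > 0.0 ? target[i] * (std::log(target[i]) - input[i]) : 0.0;
  }
  if (reduction == "mean") return sum / static_cast<double>(input.size());
  if (reduction == "batchmean") return sum / static_cast<double>(batch);
  return sum;  // "sum"
}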

View File

@@ -0,0 +1,27 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_KLDIV_H_
#define AICPU_KERNELS_NORMALIZED_KLDIV_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class KLDivCpuKernel final : public CpuKernel {
virtual std::uint32_t Compute(CpuKernelContext &ctx) override final;
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,226 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kldivlossgrad.h"
#include <complex>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kKlDivLossGrad = "KlDivLossGrad";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const uint32_t kGradIndex = 0;
const uint32_t kInputIndex = 1;
const uint32_t kTargetIndex = 2;
const std::string AttrReduction = "reduction";
const std::string AttrLog = "log_target";
const int64_t DataDefaultParallelNum = 16384;
} // namespace
namespace aicpu {
template <typename T>
void KlDivLossGradOp(Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &target,
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &grad,
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &output, std::int64_t &len, bool &log_target,
std::string &reduction) {
T constant_zero{0};
if (log_target) {
output = -Eigen::exp(target) * grad;
return;
}
if (reduction == "none") {
for (std::int64_t idx = 0; idx < len; ++idx) {
if (target(idx) > constant_zero) {
output(idx) = -target(idx) * grad(idx);
}
}
} else {
for (std::int64_t idx = 0; idx < len; ++idx) {
if (target(idx) > constant_zero) {
output(idx) = -target(idx) * grad(0);
}
}
}
return;
}
std::uint32_t KlDivLossGradExtraCheck(CpuKernelContext &ctx) {
Tensor *grad = ctx.Input(0);
Tensor *input = ctx.Input(1);
Tensor *target = ctx.Input(2);
Tensor *output = ctx.Output(0);
if (grad->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] grad is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (input->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (target->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] target is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (output->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] output is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if ((input->GetDataType() != grad->GetDataType()) || (target->GetDataType() != grad->GetDataType()) ||
(output->GetDataType() != grad->GetDataType())) {
KERNEL_LOG_ERROR(
"The data type of the grad [%s], input [%s], target [%s], output y "
"[%s] must be the same type.",
DTypeStr(grad->GetDataType()).c_str(), DTypeStr(input->GetDataType()).c_str(),
DTypeStr(target->GetDataType()).c_str(), DTypeStr(output->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> grad_dims = ctx.Input(kGradIndex)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> input_dims = ctx.Input(kInputIndex)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> target_dims = ctx.Input(kTargetIndex)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
std::string reduction = ctx.GetAttr(AttrReduction)->GetString();
if (output_dims != input_dims) {
KERNEL_LOG_ERROR(
"The data shape of the output need be the same as the input. output "
"shape [%s], input shape [%s]",
VectorToString(output_dims).c_str(), VectorToString(input_dims).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (target_dims != input_dims) {
KERNEL_LOG_ERROR(
"The data shape of the target need be the same as the input. target "
"shape [%s], input shape [%s]",
VectorToString(target_dims).c_str(), VectorToString(input_dims).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (reduction == "mean" || reduction == "sum" || reduction == "batchmean") {
if (ctx.Input(0)->NumElements() != 1) {
KERNEL_LOG_ERROR("The data num of the grad [%llu] must be 1", ctx.Input(0)->NumElements());
return KERNEL_STATUS_PARAM_INVALID;
}
} else if (reduction == "none") {
if (input_dims != grad_dims) {
KERNEL_LOG_ERROR(
"The data shape of the grad need be the same as the input. grad "
"shape "
"[%s], input shape [%s]",
VectorToString(grad_dims).c_str(), VectorToString(input_dims).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
uint32_t KlDivLossGradCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
if (KlDivLossGradExtraCheck(ctx) == KERNEL_STATUS_PARAM_INVALID) {
return KERNEL_STATUS_PARAM_INVALID;
}
// choose compute function depend on dataType
auto data_type = static_cast<DataType>(ctx.Input(kFirstInputIndex)->GetDataType());
switch (data_type) {
case DT_FLOAT16:
return KlDivLossGradCompute<Eigen::half>(ctx);
case DT_FLOAT:
return KlDivLossGradCompute<float>(ctx);
case DT_DOUBLE:
return KlDivLossGradCompute<double>(ctx);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename T>
uint32_t KlDivLossGradCpuKernel::KlDivLossGradCompute(CpuKernelContext &ctx) {
int64_t grad_total = ctx.Input(0)->NumElements();
int64_t input_total = ctx.Input(1)->NumElements();
int64_t target_total = ctx.Input(2)->NumElements();
int64_t output_y_total = ctx.Output(0)->NumElements();
int64_t total = input_total;
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
T *grad = reinterpret_cast<T *>(ctx.Input(0)->GetData());
T *input = reinterpret_cast<T *>(ctx.Input(1)->GetData());
T *target = reinterpret_cast<T *>(ctx.Input(2)->GetData());
T *output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
bool parallel_flag = false;
uint64_t data_size = ctx.Input(1)->GetDataSize();
// Determine whether to enable multi-core parallel computing
if (data_size > DataDefaultParallelNum * sizeof(T)) {
parallel_flag = true;
}
// Eigen::Array
bool log_target{false};
if (ctx.GetAttr(AttrLog) != nullptr) {
log_target = ctx.GetAttr(AttrLog)->GetBool();
}
std::string reduction{"mean"};
if (ctx.GetAttr(AttrReduction) != nullptr) {
reduction = ctx.GetAttr(AttrReduction)->GetString();
}
if (cores == 0) {
KERNEL_LOG_ERROR("KlDivLossGrad compute failed.");
return KERNEL_STATUS_INNER_ERROR;
}
if (parallel_flag) {
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
auto shard_kldivlossgrad = [&](std::int64_t begin, std::int64_t end) {
std::int64_t length = end - begin;
std::int64_t grad_begin{0}, grad_length{grad_total};
if (reduction == "none") {
grad_begin = begin;
grad_length = length;
}
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_grad(grad + grad_begin, grad_length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input + begin, length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target + begin, length, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_output(output + begin, length, 1);
T constant_zero{0};
array_output = constant_zero;
KlDivLossGradOp<T>(array_target, array_grad, array_output, length, log_target, reduction);
if (reduction == "mean") {
array_output = array_output / T(output_y_total);
} else if (reduction == "batchmean") {
std::vector<int64_t> input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
array_output = array_output / T(input_dims[0]);
}
};
KERNEL_HANDLE_ERROR(ParallelFor(ctx, total, per_unit_size, shard_kldivlossgrad), "KlDivLossGrad Compute failed.");
} else {
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_grad(grad, grad_total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input, input_total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target, target_total, 1);
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_output(output, output_y_total, 1);
T constant_zero{0};
array_output = constant_zero;
KlDivLossGradOp<T>(array_target, array_grad, array_output, output_y_total, log_target, reduction);
if (reduction == "mean") {
array_output = array_output / T(output_y_total);
} else if (reduction == "batchmean") {
std::vector<int64_t> input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
array_output = array_output / T(input_dims[0]);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kKlDivLossGrad, KlDivLossGradCpuKernel);
} // namespace aicpu
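
The backward rule follows from the forward term: d/d(input) of target * (log(target) - input) is -target, so the input gradient is -target * grad, or -exp(target) * grad when the target is stored in log space, scaled afterwards by the reduction; a framework-free sketch (illustrative names, not part of the change):

#include <cmath>
#include <vector>

// grad holds a single scalar for reduced losses ("mean", "sum", "batchmean")
// and one value per element for "none"; mean_scale is 1/N for "mean",
// 1/batch for "batchmean" and 1 otherwise, matching the divisions above.
std::vector<double> KlDivLossGradReference(const std::vector<double> &grad,
                                           const std::vector<double> &target,
                                           bool log_target, bool reduced, double mean_scale) {
  std::vector<double> grad_input(target.size(), 0.0);
  for (std::size_t i = 0; i < target.size(); ++i) {
    const double g = reduced ? grad[0] : grad[i];
    const double factor = log_target ? std::exp(target[i]) : (target[i] > 0.0 ? target[i] : 0.0);
    grad_input[i] = -factor * g * mean_scale;
  }
  return grad_input;
}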

View File

@@ -0,0 +1,42 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
#define AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class KlDivLossGradCpuKernel : public CpuKernel {
public:
KlDivLossGradCpuKernel() = default;
~KlDivLossGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
/**
* @brief compute for all types
* @param ctx cpu kernel context
* @return status if success
*/
template <typename T>
uint32_t KlDivLossGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H

View File

@@ -0,0 +1,173 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lcm.h"
#include <cmath>
#include <set>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kLcmOutputNum = 1;
const uint32_t kLcmInputNum = 2;
const char *kLcm = "Lcm";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int32_t kInput_32_32 = 3;
const int32_t kInput_32_64 = 2;
const int32_t kInput_64_32 = 1;
const int32_t kInput_64_64 = 0;
} // namespace
namespace aicpu {
// Simple recursive gcd.
template <class T>
T elewise_gcd(T a, T b) {
if (b == 0) {
return a;
}
return elewise_gcd(b, a % b);
}
// Simple lcm.
template <typename T>
T elewise_lcm(T a, T b) {
T gcd_tmp = elewise_gcd<T>(a, b);
if (gcd_tmp == 0) {
return static_cast<T>(0);
}
return std::abs(a / gcd_tmp * b);
}
uint32_t LcmIOTypeCheck(CpuKernelContext &ctx, int32_t &dual_types) {
Tensor *x1 = ctx.Input(kFirstInputIndex);
Tensor *x2 = ctx.Input(kSecondInputIndex);
Tensor *y = ctx.Output(kFirstOutputIndex);
const std::set<DataType> supported_types{DT_INT32, DT_INT64};
auto x1_type = x1->GetDataType();
auto x2_type = x2->GetDataType();
auto y_type = y->GetDataType();
KERNEL_CHECK_FALSE(supported_types.count(x1_type) != 0, KERNEL_STATUS_PARAM_INVALID,
"[Lcm] input x1 data type [%s] is not supported.", DTypeStr(x1_type).c_str());
KERNEL_CHECK_FALSE(supported_types.count(x2_type) != 0, KERNEL_STATUS_PARAM_INVALID,
"[Lcm] input x2 data type [%s] is not supported.", DTypeStr(x2_type).c_str());
int32_t x1_is_i32 = static_cast<int32_t>(x1_type == DT_INT32) << 1;
int32_t x2_is_i32 = static_cast<int32_t>(x2_type == DT_INT32);
int32_t _dual_types = x1_is_i32 | x2_is_i32;
switch (_dual_types) {
case kInput_64_64:
case kInput_64_32:
case kInput_32_64:
KERNEL_CHECK_FALSE(y_type == DT_INT64, KERNEL_STATUS_PARAM_INVALID,
"[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
dual_types = _dual_types;
break;
case kInput_32_32:
KERNEL_CHECK_FALSE(y_type == DT_INT32, KERNEL_STATUS_PARAM_INVALID,
"[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
dual_types = _dual_types;
break;
default:
KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <class T1, class T2, class T3>
uint32_t LcmElewiseCompute(CpuKernelContext &ctx, const T1 *x1_ptr, const T2 *x2_ptr, T3 *y_ptr, Bcast &bcast) {
int64_t data_num = ctx.Output(kFirstOutputIndex)->NumElements();
auto lcm_shard = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
T3 x1_ele_abs = std::abs(static_cast<T3>(x1_ptr[bcast.GetBroadcastXIndex(i)]));
T3 x2_ele_abs = std::abs(static_cast<T3>(x2_ptr[bcast.GetBroadcastYIndex(i)]));
y_ptr[i] = elewise_lcm(x1_ele_abs, x2_ele_abs);
}
};
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("[Lcm] max_core_num is 0, please check the cpu num.");
return KERNEL_STATUS_PARAM_INVALID;
}
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, lcm_shard);
if (ret != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("[Lcm] Lcm Compute failed.");
return ret;
}
} else {
lcm_shard(0, data_num);
}
return KERNEL_STATUS_OK;
}
template <class T1, class T2, class T3>
uint32_t LcmCompute(CpuKernelContext &ctx) {
Tensor *x1 = ctx.Input(kFirstInputIndex);
Tensor *x2 = ctx.Input(kSecondInputIndex);
Tensor *y = ctx.Output(kFirstOutputIndex);
const T1 *x1_ptr = reinterpret_cast<const T1 *>(x1->GetData());
const T2 *x2_ptr = reinterpret_cast<const T2 *>(x2->GetData());
T3 *y_ptr = reinterpret_cast<T3 *>(y->GetData());
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
Bcast bcast(x1_shape, x2_shape);
if (bcast.IsValid()) {
return LcmElewiseCompute<T1, T2, T3>(ctx, x1_ptr, x2_ptr, y_ptr, bcast);
} else {
KERNEL_LOG_ERROR("[Lcm] broadcast failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t LcmCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kLcmInputNum, kLcmOutputNum), "[Lcm] check input and output number failed.");
int32_t dual_types = static_cast<int32_t>(-1);
KERNEL_HANDLE_ERROR(LcmIOTypeCheck(ctx, dual_types), "[Lcm] check data type failed.");
switch (dual_types) {
case kInput_64_64:
return LcmCompute<int64_t, int64_t, int64_t>(ctx);
break;
case kInput_64_32:
return LcmCompute<int64_t, int32_t, int64_t>(ctx);
break;
case kInput_32_64:
return LcmCompute<int32_t, int64_t, int64_t>(ctx);
break;
case kInput_32_32:
return LcmCompute<int32_t, int32_t, int32_t>(ctx);
break;
default:
KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLcm, LcmCpuKernel);
} // namespace aicpu
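
The elementwise result is |a / gcd(a, b) * b|, with the result defined as 0 whenever gcd(a, b) is 0; a standalone check of that identity against std::gcd from <numeric> (C++17; illustrative names, not part of the change):

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <numeric>

// Same identity as elewise_lcm above: divide before multiplying to reduce the
// chance of intermediate overflow, and take the absolute value at the end.
int64_t LcmReference(int64_t a, int64_t b) {
  const int64_t g = std::gcd(a, b);
  return g == 0 ? 0 : std::abs(a / g * b);
}

int main() {
  std::printf("%lld %lld %lld\n",
              static_cast<long long>(LcmReference(4, 6)),    // 12
              static_cast<long long>(LcmReference(-4, 6)),   // 12
              static_cast<long long>(LcmReference(0, 7)));   // 0
  return 0;
}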

View File

@@ -0,0 +1,32 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LCM_H_
#define AICPU_KERNELS_NORMALIZED_LCM_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LcmCpuKernel : public CpuKernel {
public:
~LcmCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,126 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "logit.h"
#include "Eigen/Core"
#include "Eigen/Dense"
#include "Eigen/LU"
#include "cmath"
#include "cpu_context.h"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 16 * 1024;
const char *kLogit = "Logit";
#define LOGIT_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = LogitCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Logit kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t LogitCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogit);
DataType data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
LOGIT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
LOGIT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
LOGIT_COMPUTE_CASE(DT_FLOAT, float, ctx)
default:
KERNEL_LOG_ERROR("Logit kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogitCpuKernel::LogitCompute(CpuKernelContext &ctx) {
auto input_tensor = ctx.Input(0);
auto output_tensor = ctx.Output(0);
auto input = reinterpret_cast<T *>(input_tensor->GetData());
auto output = reinterpret_cast<T *>(output_tensor->GetData());
AttrValue *attr = ctx.GetAttr("eps");
float eps = -1.0;
if (attr != nullptr) {
eps = attr->GetFloat();
}
auto input_shape = input_tensor->GetTensorShape();
int64_t data_num = input_shape->NumElements();
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_less = [&](size_t start, size_t end) {
T one = T(1);
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
if (eps < 0) {
for (size_t i = start; i < end; i++) {
T x = input[i];
output[i] = log(x / (one - x));
}
} else {
for (size_t i = start; i < end; i++) {
T z;
T x = input[i];
z = x < static_cast<T>(eps) ? static_cast<T>(eps) : (x > up_bound ? up_bound : x);
output[i] = log(z / (one - z));
}
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max core num is 0");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less),
"Logit Compute failed.");
} else {
T one = T(1);
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
if (eps < 0) {
for (int64_t i = 0; i < data_num; i++) {
T x = input[i];
output[i] = log(x / (one - x));
}
} else {
for (int64_t i = 0; i < data_num; i++) {
T z;
T x = input[i];
z = x < static_cast<T>(eps) ? static_cast<T>(eps) : (x > up_bound ? up_bound : x);
output[i] = log(z / (one - z));
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLogit, LogitCpuKernel);
} // namespace aicpu
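
Logit computes log(x / (1 - x)); when a non-negative eps attribute is supplied, x is first clamped to [eps, 1 - eps] so that boundary inputs stay finite. A framework-free sketch of that formula (illustrative names, not part of the change):

#include <algorithm>
#include <cmath>

// Mirrors the eps < 0 (no clamp) and eps >= 0 (clamp to [eps, 1 - eps])
// branches of LogitCompute above.
double LogitReference(double x, double eps = -1.0) {
  if (eps >= 0.0) {
    x = std::min(std::max(x, eps), 1.0 - eps);
  }
  return std::log(x / (1.0 - x));
}

// LogitReference(0.5)        -> 0.0
// LogitReference(1.0, 1e-6)  -> log((1 - 1e-6) / 1e-6), finite instead of +inf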

View File

@@ -0,0 +1,36 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_H
#define AICPU_KERNELS_NORMALIZED_LOGIT_H
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LogitCpuKernel : public CpuKernel {
public:
LogitCpuKernel() = default;
~LogitCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t LogitCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,133 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "logit_grad.h"
#include "Eigen/Core"
#include "Eigen/Dense"
#include "Eigen/LU"
#include "cmath"
#include "cpu_context.h"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 16 * 1024;
const char *kLogitGrad = "LogitGrad";
#define LOGITGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = LogitGradCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("LogitGrad kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t LogitGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogitGrad);
DataType data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
LOGITGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
LOGITGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
LOGITGRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("LogitGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogitGradCpuKernel::LogitGradCompute(CpuKernelContext &ctx) {
auto input_y_grad_tensor = ctx.Input(0);
auto input_x_tensor = ctx.Input(1);
auto output_x_grad_tensor = ctx.Output(0);
auto input_y_grad = reinterpret_cast<T *>(input_y_grad_tensor->GetData());
auto input_x = reinterpret_cast<T *>(input_x_tensor->GetData());
auto output_x_grad = reinterpret_cast<T *>(output_x_grad_tensor->GetData());
auto input_shape = input_x_tensor->GetTensorShape();
int64_t data_num = input_shape->NumElements();
float eps = -1.0;
AttrValue *attr = ctx.GetAttr("eps");
if (attr != nullptr) {
eps = attr->GetFloat();
}
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U);
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_less = [&](size_t start, size_t end) {
T one = T(1);
T zero = T(0);
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
if (eps < 0) {
for (size_t i = start; i < end; i++) {
T y_grad = input_y_grad[i];
T x = input_x[i];
output_x_grad[i] = (x < zero || x > one) ? std::numeric_limits<T>::quiet_NaN() : (y_grad / (x * (one - x)));
}
} else {
for (size_t i = start; i < end; i++) {
T y_grad = input_y_grad[i];
T x = input_x[i];
output_x_grad[i] =
static_cast<float>(x) < static_cast<float>(eps) || static_cast<float>(x) > static_cast<float>(up_bound)
? zero
: (y_grad / (x * (one - x)));
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less),
"LogitGrad Compute failed.");
} else {
T one = T(1);
T zero = T(0);
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
if (eps < 0) {
for (int64_t i = 0; i < data_num; i++) {
T y_grad = input_y_grad[i];
T x = input_x[i];
output_x_grad[i] = (x < zero || x > one) ? std::numeric_limits<T>::quiet_NaN() : (y_grad / (x * (one - x)));
}
} else {
for (int64_t i = 0; i < data_num; i++) {
T y_grad = input_y_grad[i];
T x = input_x[i];
output_x_grad[i] =
static_cast<float>(x) < static_cast<float>(eps) || static_cast<float>(x) > static_cast<float>(up_bound)
? zero
: (y_grad / (x * (one - x)));
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLogitGrad, LogitGradCpuKernel);
} // namespace aicpu
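
Since logit(x) = log(x / (1 - x)), its derivative is 1 / (x * (1 - x)), so the kernel propagates grad_y / (x * (1 - x)); out-of-range x yields NaN when no eps attribute is given and zero when eps >= 0, mirroring the forward clamp. A framework-free sketch (illustrative names, not part of the change):

#include <limits>

// Mirrors the two branches of LogitGradCompute above.
double LogitGradReference(double grad_y, double x, double eps = -1.0) {
  if (eps < 0.0) {
    if (x < 0.0 || x > 1.0) {
      return std::numeric_limits<double>::quiet_NaN();
    }
  } else if (x < eps || x > 1.0 - eps) {
    return 0.0;
  }
  return grad_y / (x * (1.0 - x));
}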

View File

@@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H
#define AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LogitGradCpuKernel : public CpuKernel {
public:
LogitGradCpuKernel() = default;
~LogitGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t LogitGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,153 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lower_bound.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kLowerBound = "LowerBound";
#define LOWERBOUND_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = LowerBoundCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("LowerBound kernel compute failed."); \
return result; \
} \
break; \
}
#define LOWERBOUND_COMPUTE_CASE_ALL(TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
LOWERBOUND_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t LowerBoundCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LowerBound check input and output number failed.");
Tensor *sorted_x_data = ctx.Input(0);
Tensor *values_data = ctx.Input(1);
Tensor *output_data = ctx.Output(0);
auto output_type = output_data->GetDataType();
auto sorted_x_type = sorted_x_data->GetDataType();
auto values_type = values_data->GetDataType();
if (sorted_x_type != values_type) {
KERNEL_LOG_ERROR("Input[0] data type[%s] must be same with Input[1] data type[%s]", DTypeStr(sorted_x_type).c_str(),
DTypeStr(values_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (output_type) {
case DT_INT32:
switch (sorted_x_type) {
LOWERBOUND_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (sorted_x_type) {
LOWERBOUND_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("Output data type[%s] not supported.", DTypeStr(output_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t LowerBoundCpuKernel::LowerBoundCompute(CpuKernelContext &ctx) {
Tensor *sorted_x_data = ctx.Input(0);
auto sorted_x_data_addr = reinterpret_cast<T1 *>(sorted_x_data->GetData());
auto sorted_x_data_shape = sorted_x_data->GetTensorShape();
std::vector<int64_t> sorted_x_data_shape_dims = sorted_x_data_shape->GetDimSizes();
Tensor *values_data = ctx.Input(1);
auto values_data_addr = reinterpret_cast<T1 *>(values_data->GetData());
auto values_data_shape = values_data->GetTensorShape();
int64_t values_data_num = values_data_shape->NumElements();
std::vector<int64_t> values_data_shape_dims = values_data_shape->GetDimSizes();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T2 *>(output_data->GetData());
if (sorted_x_data_shape_dims[0] != values_data_shape_dims[0]) {
KERNEL_LOG_ERROR("The number of rows of Input[0]:([%d]) should be consistent with that of Input[1]:([%d]).",
sorted_x_data_shape_dims[0], values_data_shape_dims[0]);
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t sorted_x_data_column = sorted_x_data_shape_dims[1];
int64_t values_data_column = values_data_shape_dims[1];
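// For every element of values, binary-search its row of sorted_x for the first position not less than the value;
// small inputs are handled serially, larger ones through ParallelFor.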
if (values_data_num < 1024) {
for (int64_t i = 0; i < values_data_num; i++) {
int64_t seq_row = i / values_data_column;
int64_t low = seq_row * sorted_x_data_column;
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
int64_t mid;
while (low <= up) {
mid = (low + up) / 2;
if (values_data_addr[i] <= sorted_x_data_addr[mid]) {
up = mid - 1;
} else {
low = mid + 1;
}
}
output_data_addr[i] = low - seq_row * sorted_x_data_column;
}
} else {
uint32_t min_core_num = 1;
int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (sum_core_num > values_data_num) {
sum_core_num = values_data_num;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t seq_row = i / values_data_column;
int64_t low = seq_row * sorted_x_data_column;
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
int64_t mid;
while (low <= up) {
mid = (low + up) / 2;
if (values_data_addr[i] <= sorted_x_data_addr[mid]) {
up = mid - 1;
} else {
low = mid + 1;
}
}
output_data_addr[i] = low - seq_row * sorted_x_data_column;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, values_data_num, values_data_num / sum_core_num, shard_compute),
"LowerBound Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLowerBound, LowerBoundCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_
#define AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class LowerBoundCpuKernel : public CpuKernel {
public:
LowerBoundCpuKernel() = default;
~LowerBoundCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t LowerBoundCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,115 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lstsq.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
#include <Eigen/Dense>
#include <Eigen/Cholesky>
#include <iostream>
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kLstsq = "Lstsq";
} // namespace
namespace aicpu {
uint32_t LstsqCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Lstsq check input and output number failed.");
Tensor *input_x0 = ctx.Input(0);
Tensor *input_x1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
auto dims_0 = input_x0->GetTensorShape()->GetDims();
auto dims_1 = input_x1->GetTensorShape()->GetDims();
KERNEL_CHECK_FALSE((dims_0 == 2), KERNEL_STATUS_PARAM_INVALID, "Dimension of input[0] must be 2, but got [%d].",
dims_0);
KERNEL_CHECK_FALSE(((dims_1 == 2) || (dims_1 == 1)), KERNEL_STATUS_PARAM_INVALID,
"Dimension of input[1] must be 2 or 1, but got [%d].", dims_1);
auto shape_0 = input_x0->GetTensorShape();
auto shape_1 = input_x1->GetTensorShape();
KERNEL_CHECK_FALSE((shape_0->GetDimSize(0) == shape_1->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
"Lstsq shape_0[0] [%lld] and shape_1[0] [%lld] are not equal.", shape_0->GetDimSize(0), shape_1->GetDimSize(0));
AttrValue *l2_regularizer = ctx.GetAttr("l2_regularizer");
AttrValue *fast = ctx.GetAttr("fast");
KERNEL_CHECK_NULLPTR(l2_regularizer, KERNEL_STATUS_PARAM_INVALID, "Get l2_regularizer failed.");
KERNEL_CHECK_NULLPTR(fast, KERNEL_STATUS_PARAM_INVALID, "Get fast failed.");
KERNEL_LOG_DEBUG(
"LstsqCpuKernel[%s], inputx0: size[%llu];"
"inputx1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_x0->GetDataSize(), input_x1->GetDataSize(), output->GetDataSize());
DataType data_type1 = ctx.Input(0)->GetDataType();
DataType data_type2 = ctx.Input(1)->GetDataType();
KERNEL_CHECK_FALSE((data_type1 == data_type2), KERNEL_STATUS_PARAM_INVALID,
"Lstsq input_0 data type [%s] must be equal to input_1 data type [%s].", DTypeStr(data_type1).c_str(),
DTypeStr(data_type2).c_str());
switch (data_type1) {
case DT_FLOAT16:
return LstsqCompute<float, Eigen::half>(ctx);
case DT_FLOAT:
return LstsqCompute<float, float>(ctx);
case DT_DOUBLE:
return LstsqCompute<double, double>(ctx);
default:
KERNEL_LOG_ERROR("Lstsq kernel data type [%u] not support.", DTypeStr(data_type1).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t LstsqCpuKernel::LstsqCompute(CpuKernelContext &ctx) {
Eigen::Index m = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
Eigen::Index n = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
Eigen::Index k = 1;
if (ctx.Input(1)->GetTensorShape()->GetDims() == 2) {
k = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
}
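// m >= n: solve the least-squares problem with column-pivoted Householder QR;
// m < n: form the minimum-norm solution x = A^T (A A^T)^(-1) B.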
typedef Eigen::Matrix<T1, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartixXd;
MartixXd A(m, n);
MartixXd B(m, k);
auto aptr = reinterpret_cast<T2 *>(ctx.Input(0)->GetData());
auto bptr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
for (int i = 0; i < m * n; i++) {
*(A.data() + i) = static_cast<T1>(*(aptr + i));
}
for (int i = 0; i < m * k; i++) {
*(B.data() + i) = static_cast<T1>(*(bptr + i));
}
MartixXd result(n, k);
if (m >= n) {
result = A.colPivHouseholderQr().solve(B);
} else {
MartixXd A_Transpose = A.transpose();
MartixXd temp = A * A_Transpose;
MartixXd tempI = temp.inverse();
MartixXd x = A_Transpose * tempI;
MartixXd output = x * B;
result = output;
}
auto output_addr = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
for (int i = 0; i < n * k; i++) {
*(output_addr + i) = static_cast<T2>(*(result.data() + i));
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLstsq, LstsqCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LSTSQ_H_
#define AICPU_KERNELS_NORMALIZED_LSTSQ_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class LstsqCpuKernel : public CpuKernel {
public:
LstsqCpuKernel() = default;
~LstsqCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t LstsqCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,185 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lu_solve.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
#include <Eigen/Dense>
#include <iostream>
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const int64_t kParallelBatchNum1 = 50;
const int64_t kParallelBatchNum4 = 200;
const int64_t kParallelBatchNum8 = 500;
const int64_t kParallelBatchNumx = 1000;
const char *kLuSolve = "LuSolve";
} // namespace
namespace aicpu {
uint32_t LuSolveCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check LuSolve params failed.");
Tensor *input_0 = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input0 data failed.");
Tensor *input_1 = ctx.Input(1);
KERNEL_CHECK_NULLPTR(input_1->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input1 data failed.");
Tensor *input_2 = ctx.Input(2);
KERNEL_CHECK_NULLPTR(input_2->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input2 data failed.");
Tensor *output = ctx.Output(0);
auto input_0_Shape = input_0->GetTensorShape();
KERNEL_CHECK_NULLPTR(input_0_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_0_Shape failed.")
auto input_1_Shape = input_1->GetTensorShape();
KERNEL_CHECK_NULLPTR(input_1_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_1_Shape failed.")
auto input_2_Shape = input_2->GetTensorShape();
KERNEL_CHECK_NULLPTR(input_2_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_2_Shape failed.")
int32_t b_dims = input_0_Shape->GetDims();
int32_t lu_dims = input_1_Shape->GetDims();
int32_t pivots_dims = input_2_Shape->GetDims();
std::vector<int64_t> b_dims_vector = input_0_Shape->GetDimSizes();
std::vector<int64_t> lu_dims_vector = input_1_Shape->GetDimSizes();
std::vector<int64_t> pivots_dims_vector = input_2_Shape->GetDimSizes();
if (b_dims == lu_dims) {
for (int32_t i = 0; i <= b_dims - 2; i++) {
if (b_dims_vector[i] != lu_dims_vector[i]) {
KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else if (lu_dims > b_dims) {
for (int32_t i = 0; i < b_dims - 2; i++) {
if (b_dims_vector[i] != lu_dims_vector[lu_dims - b_dims + i]) {
KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
return KERNEL_STATUS_PARAM_INVALID;
}
}
} else {
for (int32_t i = 0; i < lu_dims - 2; i++) {
if (lu_dims_vector[i] != b_dims_vector[b_dims - lu_dims + i]) {
KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
for (int32_t i = 0; i < pivots_dims; i++) {
if (lu_dims_vector[i] != pivots_dims_vector[i]) {
KERNEL_LOG_ERROR("batch dimension of LU_pivots doesn't match batch dimension of LU_data!");
return KERNEL_STATUS_PARAM_INVALID;
}
}
auto data_type = ctx.Input(0)->GetDataType();
KERNEL_LOG_DEBUG(
"LuSolveCpuKernel[%s], input_0: size[%llu], input_1: size[%llu], input_2: size[%llu]"
"output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), input_2->GetDataSize(),
output->GetDataSize());
switch (data_type) {
case DT_FLOAT:
return LuSolveCompute<float, float>(ctx);
case DT_FLOAT16:
return LuSolveCompute<float, Eigen::half>(ctx);
default:
KERNEL_LOG_ERROR("LuSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename T2>
uint32_t LuSolveCpuKernel::LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr,
int32_t *pivots_working_ptr, int64_t b_stride, int64_t a) {
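// Solve one batch element: apply the LU pivots as row swaps to b, then solve (L * U) * x = P * b.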
auto output_y = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
auto input_0_Shape = ctx.Input(0)->GetTensorShape();
auto input_1_Shape = ctx.Input(1)->GetTensorShape();
int32_t lu_dims = input_1_Shape->GetDims();
int64_t lu_matrix_sizes = input_1_Shape->GetDimSize(lu_dims - 2);
int32_t b_dim = input_0_Shape->GetDims();
int64_t b_m = input_0_Shape->GetDimSize(b_dim - 1);
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
MatrixXd matrix_b = Eigen::Map<MatrixXd>(b_working_ptr, lu_matrix_sizes, b_m);
MatrixXd matrix_A = Eigen::Map<MatrixXd>(lu_working_ptr, lu_matrix_sizes, lu_matrix_sizes);
for (int64_t i = 0; i < input_0_Shape->GetDimSize(b_dim - 2); i++) {
matrix_b.row(i).swap(matrix_b.row(*(pivots_working_ptr + i) - 1));
}
MatrixXd L = matrix_A.template triangularView<Eigen::UnitLower>();
MatrixXd U = matrix_A.template triangularView<Eigen::Upper>();
MatrixXd result = (L * U).lu().solve(matrix_b);
for (int64_t m = 0; m < b_stride; m++) {
*(output_y + a * b_stride + m) = (T2) * (result.data() + m);
}
return KERNEL_STATUS_OK;
}
template <typename T, typename T2>
uint32_t LuSolveCpuKernel::LuSolveCompute(CpuKernelContext &ctx) {
auto input_x0 = reinterpret_cast<T2 *>(ctx.Input(0)->GetData());
auto input_x1 = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto input_x2 = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
auto input_0_Shape = ctx.Input(0)->GetTensorShape();
auto input_1_Shape = ctx.Input(1)->GetTensorShape();
auto input_2_Shape = ctx.Input(2)->GetTensorShape();
T *input_0 = new T[input_0_Shape->NumElements()];
T *input_1 = new T[input_1_Shape->NumElements()];
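// Promote the raw inputs (possibly Eigen::half) to the computation type T before solving.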
for (int64_t i = 0; i < input_0_Shape->NumElements(); i++) {
*(input_0 + i) = (T) * (input_x0 + i);
}
for (int64_t i = 0; i < input_1_Shape->NumElements(); i++) {
*(input_1 + i) = (T) * (input_x1 + i);
}
int32_t b_dims = input_0_Shape->GetDims();
int32_t lu_dims = input_1_Shape->GetDims();
std::vector<int64_t> b_dims_vector = input_0_Shape->GetDimSizes();
std::vector<int64_t> lu_dims_vector = input_1_Shape->GetDimSizes();
std::vector<int64_t> pivots_dims_vector = input_2_Shape->GetDimSizes();
int64_t b_stride = input_0_Shape->GetDimSize(b_dims - 1) * input_0_Shape->GetDimSize(b_dims - 2);
int64_t lu_stride = input_1_Shape->GetDimSize(lu_dims - 1) * input_1_Shape->GetDimSize(lu_dims - 2);
int64_t pivots_stride = input_1_Shape->GetDimSize(lu_dims - 1);
std::vector<int64_t> b_shape = b_dims_vector;
std::vector<int64_t> lu_shape = lu_dims_vector;
for (size_t i = 0; i < 2; i++) {
b_shape.pop_back();
lu_shape.pop_back();
}
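// Broadcast the batch dimensions of b and LU_data (last two dims stripped above) to pair each b with its factors.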
Bcast bcast(b_shape, lu_shape);
int64_t batch_num = ctx.Output(0)->NumElements() / b_stride;
if (batch_num < kParallelBatchNum1) {
for (int64_t i = 0; i < batch_num; i++) {
T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (batch_num < kParallelBatchNumx) max_core_num = 8U;
if (batch_num < kParallelBatchNum8) max_core_num = 4U;
if (batch_num < kParallelBatchNum4) max_core_num = 2U;
auto sharder = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder),
"LuSolve Compute failed.");
}
delete[] input_0;
delete[] input_1;
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLuSolve, LuSolveCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,22 @@
#ifndef AICPU_KERNELS_NORMALIZED_LUSOLVE_H_
#define AICPU_KERNELS_NORMALIZED_LUSOLVE_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LuSolveCpuKernel : public CpuKernel {
public:
LuSolveCpuKernel() = default;
~LuSolveCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T, typename T2>
static uint32_t LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr, int32_t *pivots_working_ptr,
int64_t b_stride, int64_t i);
template <typename T, typename T2>
static uint32_t LuSolveCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,321 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lu_unpack.h"
#include <string.h>
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include "cpu_context.h"
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "cpu_tensor.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 2;
const uint32_t kFirstInputIndex = 0;
const uint32_t kSecondInputIndex = 1;
const uint32_t kFirstOutputIndex = 0;
const uint32_t kSecondOutputIndex = 1;
const uint32_t kThirdOutputIndex = 2;
const int32_t kLuDataMinRank = 2;
const int32_t kLuPivotsMinRank = 2;
const int64_t kParallelBatchNum = 70;
const char *kLuUnpack = "LuUnpack";
} // namespace
namespace aicpu {
template <typename T_data, typename T_pivots>
uint32_t LuUnpackCpuKernel::LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index,
T_data *P_eye) {
int32_t Lu_data_dims = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDims();
int64_t Lu_data_dim1 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 2);
int64_t Lu_data_dim2 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 1);
int32_t Lu_pivots_dims = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDims();
int64_t Lu_pivots_dim = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDimSize(Lu_pivots_dims - 1);
int64_t matrix_width = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 2];
int64_t matrix_height = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 1];
int64_t pivots_stride = Lu_data_dim1 * Lu_data_dim1;
int64_t L_stride = 0;
int64_t U_stride = 0;
if (Lu_data_dim1 > Lu_data_dim2) {
L_stride = Lu_data_dim1 * Lu_data_dim2;
U_stride = Lu_data_dim2 * Lu_data_dim2;
} else {
L_stride = Lu_data_dim1 * Lu_data_dim1;
U_stride = Lu_data_dim1 * Lu_data_dim2;
}
int64_t matrix_size = matrix_width * matrix_height;
using MatrixMap = Eigen::Map<Eigen::Matrix<T_data, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
MatrixMap input(reinterpret_cast<T_data *>(ctx.Input(kFirstInputIndex)->GetData()) + matrix_index * matrix_size,
matrix_width, matrix_height);
// Triu
if (matrix_width > matrix_height) {
MatrixMap output2(reinterpret_cast<T_data *>(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride,
matrix_height, matrix_height);
T_data *MiddlePtr = new T_data[matrix_size];
MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height);
MiddleData = input.template triangularView<Eigen::Upper>();
output2 = MiddleData.block(0, 0, matrix_height, matrix_height);
delete[] MiddlePtr;
} else {
MatrixMap output2(reinterpret_cast<T_data *>(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride,
matrix_width, matrix_height);
output2 = input.template triangularView<Eigen::Upper>();
}
// Tril
if (matrix_height > matrix_width) {
MatrixMap output1(reinterpret_cast<T_data *>(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride,
matrix_width, matrix_width);
T_data *MiddlePtr = new T_data[matrix_size];
MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height);
MiddleData = input.template triangularView<Eigen::UnitLower>();
output1 = MiddleData.block(0, 0, matrix_width, matrix_width);
delete[] MiddlePtr;
} else {
MatrixMap output1(reinterpret_cast<T_data *>(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride,
matrix_width, matrix_height);
output1 = input.template triangularView<Eigen::UnitLower>();
}
// Swap
std::vector<T_pivots> final_order;
final_order.resize(Lu_data_dim1);
for (int i = 0; i < Lu_data_dim1; i++) {
final_order[i] = T_pivots(i);
}
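// Apply the 1-based pivot swaps in order to accumulate the final row permutation of the identity matrix.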
for (T_pivots id = 0; id < Lu_pivots_dim; id++) {
int64_t perm_id = 0;
int64_t perm_pivots_id = 0;
for (int64_t i = 0; i < Lu_data_dim1; i++) {
if (id == final_order[i]) {
perm_id = i;
}
if (!((*(Lu_pivots_working_ptr + id) <= Lu_data_dim1) && (*(Lu_pivots_working_ptr + id) >= 1))) {
return KERNEL_STATUS_PARAM_INVALID;
}
if ((*(Lu_pivots_working_ptr + id) - 1) == final_order[i]) {
perm_pivots_id = i;
}
}
std::swap(final_order[perm_id], final_order[perm_pivots_id]);
}
// Index_select
auto output_y0 = reinterpret_cast<T_data *>(ctx.Output(kFirstOutputIndex)->GetData());
int64_t indices_num = final_order.size();
int64_t inner_size = Lu_data_dim1;
int64_t slice_size = inner_size * sizeof(T_data);
for (int64_t j = 0; j < indices_num; ++j) {
auto params_idx = final_order[j] * inner_size;
auto out_idx = j * inner_size;
memcpy(output_y0 + matrix_index * pivots_stride + out_idx, P_eye + params_idx, slice_size);
}
return KERNEL_STATUS_OK;
}
template <typename T_data, typename T_pivots>
uint32_t LuUnpackCpuKernel::LuUnpackCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(kFirstInputIndex);
Tensor *input1_tensor = ctx.Input(kSecondInputIndex);
auto input_0_Shape = input0_tensor->GetTensorShape();
auto input_1_Shape = input1_tensor->GetTensorShape();
int32_t Lu_data_dims = input_0_Shape->GetDims();
int64_t Lu_data_dim1 = input_0_Shape->GetDimSize(Lu_data_dims - 2);
int64_t Lu_data_dim2 = input_0_Shape->GetDimSize(Lu_data_dims - 1);
int32_t Lu_pivots_dims = input_1_Shape->GetDims();
int64_t Lu_pivots_dim = input_1_Shape->GetDimSize(Lu_pivots_dims - 1);
auto input_dim_size = input_0_Shape->GetDimSizes();
auto input_x1 = reinterpret_cast<T_pivots *>(input1_tensor->GetData());
int32_t block_size = Lu_data_dim1 * Lu_data_dim1;
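// P_eye is a Lu_data_dim1 x Lu_data_dim1 identity matrix; its rows are gathered per the pivots to build P.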
T_data *P_eye = new T_data[block_size]{};
T_data num = static_cast<T_data>(1);
for (int32_t i = 0; i < Lu_data_dim1; i++) {
*(P_eye + (Lu_data_dim1 + 1) * i) = num;
}
uint32_t check_status = 0;
int64_t Lu_data_stride = Lu_data_dim1 * Lu_data_dim2;
int64_t Lu_pivots_stride = Lu_pivots_dim;
int64_t batch_num = ctx.Input(0)->NumElements() / Lu_data_stride;
if (batch_num < kParallelBatchNum || Lu_data_dims == kLuDataMinRank) {
for (int64_t matrix_index = 0; matrix_index < batch_num; matrix_index++) {
T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride;
check_status = LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye);
if (check_status == KERNEL_STATUS_PARAM_INVALID) {
delete[] P_eye;
return check_status;
}
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > batch_num) {
max_core_num = batch_num;
}
uint32_t parallel_status = 0;
auto sharder = [&](int64_t start, int64_t end) {
for (int64_t matrix_index = start; matrix_index < end; matrix_index++) {
T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride;
if (LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye) == KERNEL_STATUS_OK) {
parallel_status = KERNEL_STATUS_OK;
} else {
parallel_status = KERNEL_STATUS_PARAM_INVALID;
break;
}
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder),
"LuUnpack Compute failed.");
if (parallel_status != KERNEL_STATUS_OK) {
delete[] P_eye;
return KERNEL_STATUS_PARAM_INVALID;
}
}
delete[] P_eye;
return KERNEL_STATUS_OK;
}
void LuUnpackCpuKernel::SetMap() {
calls_[DT_INT8][DT_INT8] = LuUnpackCompute<int8_t, int8_t>;
calls_[DT_INT8][DT_UINT8] = LuUnpackCompute<int8_t, uint8_t>;
calls_[DT_INT8][DT_INT16] = LuUnpackCompute<int8_t, int16_t>;
calls_[DT_INT8][DT_INT32] = LuUnpackCompute<int8_t, int32_t>;
calls_[DT_INT8][DT_INT64] = LuUnpackCompute<int8_t, int64_t>;
calls_[DT_INT16][DT_INT8] = LuUnpackCompute<int16_t, int8_t>;
calls_[DT_INT16][DT_INT16] = LuUnpackCompute<int16_t, int16_t>;
calls_[DT_INT16][DT_INT32] = LuUnpackCompute<int16_t, int32_t>;
calls_[DT_INT16][DT_INT64] = LuUnpackCompute<int16_t, int64_t>;
calls_[DT_INT16][DT_UINT8] = LuUnpackCompute<int16_t, uint8_t>;
calls_[DT_INT32][DT_INT8] = LuUnpackCompute<int32_t, int8_t>;
calls_[DT_INT32][DT_INT16] = LuUnpackCompute<int32_t, int16_t>;
calls_[DT_INT32][DT_INT32] = LuUnpackCompute<int32_t, int32_t>;
calls_[DT_INT32][DT_INT64] = LuUnpackCompute<int32_t, int64_t>;
calls_[DT_INT32][DT_UINT8] = LuUnpackCompute<int32_t, uint8_t>;
calls_[DT_INT64][DT_INT8] = LuUnpackCompute<int64_t, int8_t>;
calls_[DT_INT64][DT_INT16] = LuUnpackCompute<int64_t, int16_t>;
calls_[DT_INT64][DT_INT32] = LuUnpackCompute<int64_t, int32_t>;
calls_[DT_INT64][DT_INT64] = LuUnpackCompute<int64_t, int64_t>;
calls_[DT_INT64][DT_UINT8] = LuUnpackCompute<int64_t, uint8_t>;
calls_[DT_FLOAT16][DT_INT8] = LuUnpackCompute<Eigen::half, int8_t>;
calls_[DT_FLOAT16][DT_INT16] = LuUnpackCompute<Eigen::half, int16_t>;
calls_[DT_FLOAT16][DT_INT32] = LuUnpackCompute<Eigen::half, int32_t>;
calls_[DT_FLOAT16][DT_INT64] = LuUnpackCompute<Eigen::half, int64_t>;
calls_[DT_FLOAT16][DT_UINT8] = LuUnpackCompute<Eigen::half, uint8_t>;
calls_[DT_FLOAT][DT_INT8] = LuUnpackCompute<float, int8_t>;
calls_[DT_FLOAT][DT_INT16] = LuUnpackCompute<float, int16_t>;
calls_[DT_FLOAT][DT_INT32] = LuUnpackCompute<float, int32_t>;
calls_[DT_FLOAT][DT_INT64] = LuUnpackCompute<float, int64_t>;
calls_[DT_FLOAT][DT_UINT8] = LuUnpackCompute<float, uint8_t>;
calls_[DT_DOUBLE][DT_INT8] = LuUnpackCompute<double, int8_t>;
calls_[DT_DOUBLE][DT_INT16] = LuUnpackCompute<double, int16_t>;
calls_[DT_DOUBLE][DT_INT32] = LuUnpackCompute<double, int32_t>;
calls_[DT_DOUBLE][DT_INT64] = LuUnpackCompute<double, int64_t>;
calls_[DT_DOUBLE][DT_UINT8] = LuUnpackCompute<double, uint8_t>;
calls_[DT_UINT8][DT_INT8] = LuUnpackCompute<uint8_t, int8_t>;
calls_[DT_UINT8][DT_INT16] = LuUnpackCompute<uint8_t, int16_t>;
calls_[DT_UINT8][DT_INT32] = LuUnpackCompute<uint8_t, int32_t>;
calls_[DT_UINT8][DT_INT64] = LuUnpackCompute<uint8_t, int64_t>;
calls_[DT_UINT8][DT_UINT8] = LuUnpackCompute<uint8_t, uint8_t>;
}
uint32_t LuUnpackCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LuUnpack check input and output number failed.");
Tensor *LU_data_ = ctx.Input(0);
Tensor *LU_pivots_ = ctx.Input(1);
std::shared_ptr<TensorShape> LU_data_shape = LU_data_->GetTensorShape();
std::shared_ptr<TensorShape> LU_pivots_shape = LU_pivots_->GetTensorShape();
int32_t LU_data_rank = LU_data_shape->GetDims();
if (LU_data_rank < kLuDataMinRank) {
KERNEL_LOG_ERROR(
"The input dim size of LU_data must be at least 2-D, "
"while %d",
LU_data_rank);
return KERNEL_STATUS_PARAM_INVALID;
}
int32_t Lu_data_dims = LU_data_shape->GetDims();
int64_t Lu_data_dim1 = LU_data_shape->GetDimSize(Lu_data_dims - 2);
int64_t Lu_data_dim2 = LU_data_shape->GetDimSize(Lu_data_dims - 1);
int32_t Lu_pivots_dims = LU_pivots_shape->GetDims();
int64_t Lu_pivots_dim = LU_pivots_shape->GetDimSize(Lu_pivots_dims - 1);
if (Lu_pivots_dim != std::min(Lu_data_dim1, Lu_data_dim2)) {
KERNEL_LOG_ERROR(
"The last dimension of LU_pivots must be the same as the minimum value "
"of the last two dimensions of LU_data, "
"but got The last dimension of LU_pivots [%d], the minimum value of "
"the last two dimensions of LU_data: [%d]",
Lu_pivots_dim, std::min(Lu_data_dim1, Lu_data_dim2));
return KERNEL_STATUS_PARAM_INVALID;
}
for (int32_t i = 0; i < Lu_pivots_dims - 1; i++) {
if (LU_data_shape->GetDimSize(i) != LU_pivots_shape->GetDimSize(i)) {
KERNEL_LOG_ERROR(
" LU_data's batch dimensions does not match LU_pivots's batch "
"dimensions.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
DataType LU_data_dtype = static_cast<DataType>(LU_data_->GetDataType());
bool LU_data_dtype_flag = LU_data_dtype != DT_FLOAT16 && LU_data_dtype != DT_FLOAT && LU_data_dtype != DT_DOUBLE &&
LU_data_dtype != DT_INT8 && LU_data_dtype != DT_UINT8 && LU_data_dtype != DT_INT16 &&
LU_data_dtype != DT_INT32 && LU_data_dtype != DT_INT64;
if (LU_data_dtype_flag) {
KERNEL_LOG_ERROR(
"Op LuUnpack first input LU_data_type's data type should be of the "
"follows: "
"DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT16, "
"DT_FLOAT, DT_DOUBLE, "
"but this type is [%s].",
DTypeStr(LU_data_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
DataType LU_pivots_dtype = static_cast<DataType>(LU_pivots_->GetDataType());
bool LU_pivots_dtype_flag = LU_pivots_dtype != DT_INT8 && LU_pivots_dtype != DT_UINT8 &&
LU_pivots_dtype != DT_INT16 && LU_pivots_dtype != DT_INT32 && LU_pivots_dtype != DT_INT64;
if (LU_pivots_dtype_flag) {
KERNEL_LOG_ERROR(
"Op LuUnpack second input LU_pivots_type's data type should be of the "
"follows: "
"DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, "
"but this type is [%s].",
DTypeStr(LU_pivots_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
SetMap();
std::vector<DataType> LU_data_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32,
DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE};
std::vector<DataType> LU_pivots_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64};
for (uint64_t i = 0; i < LU_data_type_vec.size(); i++) {
for (uint64_t j = 0; j < LU_pivots_type_vec.size(); j++) {
if (LU_data_dtype == LU_data_type_vec[i] && LU_pivots_dtype == LU_pivots_type_vec[j]) {
KERNEL_HANDLE_ERROR(calls_[LU_data_type_vec[i]][LU_pivots_type_vec[j]](ctx),
"The elements of LU_pivots must be greater than 1 "
"and be less than the size of LU_pivots's last dimension.");
}
}
}
calls_.clear();
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLuUnpack, LuUnpackCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LUUNPACK_H_
#define AICPU_KERNELS_NORMALIZED_LUUNPACK_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LuUnpackCpuKernel : public CpuKernel {
public:
LuUnpackCpuKernel() = default;
~LuUnpackCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T_data, typename T_pivots>
static uint32_t LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index, T_data *P_eye);
template <typename T_data, typename T_pivots>
static uint32_t LuUnpackCompute(CpuKernelContext &ctx);
template <typename T_pivots>
static uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
std::map<int, std::map<int, std::function<uint32_t(CpuKernelContext &)>>> calls_;
void SetMap();
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,183 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lu_unpack_grad.h"
#include <iostream>
#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/broadcast_iterator.h"
#include "utils/kernel_util.h"
namespace {
const char *kLuUnpackGrad = "LuUnpackGrad";
const int64_t kParallelBatchNum = 30;
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 2;
const uint32_t kInputFirst = 0;
const uint32_t kInputSecond = 1;
const uint32_t kInputThird = 2;
} // namespace
namespace aicpu {
uint32_t LuUnpackGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LuUnpackGrad check input and output number failed.");
// choose compute function depend on dataType
auto input_type = static_cast<DataType>(ctx.Input(kInputThird)->GetDataType());
switch (input_type) {
case DT_FLOAT16:
return LuUnpackGradCompute<Eigen::half>(ctx);
case DT_FLOAT:
return LuUnpackGradCompute<float>(ctx);
case DT_DOUBLE:
return LuUnpackGradCompute<double>(ctx);
case DT_INT8:
return LuUnpackGradCompute<int8_t>(ctx);
case DT_INT16:
return LuUnpackGradCompute<int16_t>(ctx);
case DT_INT32:
return LuUnpackGradCompute<int32_t>(ctx);
case DT_INT64:
return LuUnpackGradCompute<int64_t>(ctx);
case DT_UINT8:
return LuUnpackGradCompute<uint8_t>(ctx);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LuUnpackGradCpuKernel::TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a) {
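// Copy the strictly lower triangle of L_grad and the upper triangle of U_grad for batch element a into the
// full-sized output gradients.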
Tensor *L_grad = NULL;
Tensor *U_grad = NULL;
Tensor *LU_data = NULL;
L_grad = ctx.Input(kInputFirst);
U_grad = ctx.Input(kInputSecond);
LU_data = ctx.Input(kInputThird);
auto LU_data_shape = LU_data->GetTensorShape();
int32_t LU_data_dims = LU_data_shape->GetDims();
int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2);
int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1);
auto LU_dim_min = std::min(LU_data_height, LU_data_width);
auto input_U_shape = U_grad->GetTensorShape();
auto input_U_dim_size = input_U_shape->GetDimSizes();
auto input_U_dims = input_U_shape->GetDims();
int64_t matrix_U_width = input_U_dim_size[input_U_dims - 2];
int64_t matrix_U_height = input_U_dim_size[input_U_dims - 1];
int64_t matrix_U_size = matrix_U_width * matrix_U_height;
auto input_L_shape = L_grad->GetTensorShape();
auto input_L_dim_size = input_L_shape->GetDimSizes();
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
auto input_L_dims = input_L_shape->GetDims();
int64_t matrix_L_width = input_L_dim_size[input_L_dims - 2];
int64_t matrix_L_height = input_L_dim_size[input_L_dims - 1];
int64_t matrix_L_size = matrix_L_width * matrix_L_height;
int64_t output_stride = LU_data_height * LU_data_width;
MatrixMap input_L(reinterpret_cast<T *>(L_grad->GetData()) + a * matrix_L_size, matrix_L_width, matrix_L_height);
MatrixMap input_U(reinterpret_cast<T *>(U_grad->GetData()) + a * matrix_U_size, matrix_U_width, matrix_U_height);
if (LU_data_width > LU_data_height) {
MatrixMap output_L(reinterpret_cast<T *>(L_grad_output->GetData()) + a * output_stride, LU_data_height,
LU_data_width);
T *MiddlePtr = new T[matrix_L_size];
MatrixMap MiddleData(MiddlePtr, matrix_L_width, matrix_L_height);
MiddleData = input_L.template triangularView<Eigen::StrictlyLower>();
for (auto i = 0; i < LU_data_height; i++) {
for (auto j = 0; j < LU_dim_min; j++) {
output_L(i, j) = MiddleData(i, j);
}
}
delete[] MiddlePtr;
} else {
MatrixMap output_L(reinterpret_cast<T *>(L_grad_output->GetData()) + a * output_stride, LU_data_height,
LU_data_width);
output_L = input_L.template triangularView<Eigen::StrictlyLower>();
}
if (LU_data_height > LU_data_width) {
MatrixMap output_U(reinterpret_cast<T *>(U_grad_output->GetData()) + a * output_stride, LU_data_height,
LU_data_width);
T *MiddlePtr = new T[matrix_U_size];
MatrixMap MiddleData(MiddlePtr, matrix_U_width, matrix_U_height);
MiddleData = input_U.template triangularView<Eigen::Upper>();
for (auto i = 0; i < LU_dim_min; i++) {
for (auto j = i; j < LU_data_width; j++) {
output_U(i, j) = MiddleData(i, j);
}
}
delete[] MiddlePtr;
} else {
MatrixMap output_U(reinterpret_cast<T *>(U_grad_output->GetData()) + a * output_stride, LU_data_height,
LU_data_width);
output_U = input_U.template triangularView<Eigen::Upper>();
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LuUnpackGradCpuKernel::LuUnpackGradCompute(CpuKernelContext &ctx) {
Tensor *LU_data = NULL;
Tensor *L_grad_output = NULL;
Tensor *U_grad_output = NULL;
LU_data = ctx.Input(kInputThird);
L_grad_output = ctx.Output(0);
U_grad_output = ctx.Output(1);
auto LU_data_shape = LU_data->GetTensorShape();
int32_t LU_data_dims = LU_data_shape->GetDims();
int64_t LU_data_elem_num = LU_data->NumElements();
int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2);
int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1);
int64_t LU_data_stride = LU_data_height * LU_data_width;
int64_t matrix_num = LU_data_elem_num / LU_data_stride;
auto L_grad_output_data = reinterpret_cast<T *>(L_grad_output->GetData());
auto U_grad_output_data = reinterpret_cast<T *>(U_grad_output->GetData());
for (int64_t i = 0; i < LU_data_elem_num; i++) {
*(L_grad_output_data + i) = static_cast<T>(0);
*(U_grad_output_data + i) = static_cast<T>(0);
}
if (matrix_num < kParallelBatchNum) {
for (int64_t i = 0; i < matrix_num; i++) {
TriLU<T>(ctx, L_grad_output, U_grad_output, i);
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto sharder = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
TriLU<T>(ctx, L_grad_output, U_grad_output, i);
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharder),
"LuUnpackGrad Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLuUnpackGrad, LuUnpackGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_LU_UNPACK_GRAD_H_
#define AICPU_KERNELS_LU_UNPACK_GRAD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class LuUnpackGradCpuKernel : public CpuKernel {
public:
~LuUnpackGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
/**
* @brief compute for all types
* @param ctx cpu kernel context
* @return status if success
*/
template <typename T>
uint32_t LuUnpackGradCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,179 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "matmul.h"
#include <complex>
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/kernel_util.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
using namespace std;
namespace {
const char *kMatmul = "MatMul";
} // namespace
namespace aicpu {
template <typename T>
uint32_t MatMulCpuKernel::AddCompute(CpuKernelContext &ctx, Bcast &bcast) {
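// Broadcast the 1-D bias (input x3) over the matmul result already stored in the output and add it elementwise.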
auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
for (int64_t i = 0; i < data_num; i++) {
auto input1 = in2 + bcast.GetBroadcastXIndex(i);  // broadcast element of the bias (input x3)
auto input2 = out + bcast.GetBroadcastYIndex(i);  // current element of the matmul result
*(out + i) = (*input1) + (*input2);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MatMulCpuKernel::BiasCompute(CpuKernelContext &ctx) {
auto input0_tensor = ctx.Input(0);
auto input2_tensor = ctx.Input(2);
auto input2_shape = input2_tensor->GetTensorShape()->GetDimSizes();
auto output_tensor = ctx.Output(kFirstOutputIndex);
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE(input2_tensor->GetTensorShape()->GetDims() == 1, KERNEL_STATUS_PARAM_INVALID,
"Input[x3] must be a 1D tensor")
DataType input0_data_type = input0_tensor->GetDataType();
DataType input2_data_type = input2_tensor->GetDataType();
KERNEL_CHECK_FALSE((input0_data_type == input2_data_type), KERNEL_STATUS_PARAM_INVALID,
"Input[x1] data type[%s] and input[x3] data type[%s] must be same",
DTypeStr(input0_data_type).c_str(), DTypeStr(input2_data_type).c_str())
Bcast bcast(input2_shape, output_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return AddCompute<T>(ctx, bcast);
}
template <typename T>
uint32_t MatMulCpuKernel::MatMulCompute(CpuKernelContext &ctx) {
auto input0_tensor = ctx.Input(0);
auto input0_tensor_shape = input0_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsMatrix(input0_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input[x1] must be a matrix")
auto input1_tensor = ctx.Input(1);
auto input1_tensor_shape = input1_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsMatrix(input1_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"Input[x2] must be a matrix")
auto transpose_x1 = ctx.GetAttr("transpose_x1")->GetBool();
auto transpose_x2 = ctx.GetAttr("transpose_x2")->GetBool();
KERNEL_LOG_DEBUG(
"%s Attr[transpose_x1] value[%d], "
"Attr[transpose_x2] value[%d].",
kMatmul, transpose_x1, transpose_x2);
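// Inner dimensions that must match: dim 1 of x1 (dim 0 if transposed) and dim 0 of x2 (dim 1 if transposed).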
int32_t x1_dim = transpose_x1 ? 0 : 1;
int32_t x2_dim = transpose_x2 ? 1 : 0;
KERNEL_CHECK_FALSE((input0_tensor_shape->GetDimSize(x1_dim) == input1_tensor_shape->GetDimSize(x2_dim)),
KERNEL_STATUS_PARAM_INVALID,
"Matrix size incompatible, input[x1] dim[%d] value[%lld], "
"input[x2] dim[%d] value[%lld]",
x1_dim, input0_tensor_shape->GetDimSize(x1_dim), x2_dim, input1_tensor_shape->GetDimSize(x2_dim))
auto input0_shape = input0_tensor_shape->GetDimSizes();
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
MatrixMap input0(reinterpret_cast<T *>(input0_tensor->GetData()), input0_shape[0], input0_shape[1]);
auto input1_shape = input1_tensor_shape->GetDimSizes();
MatrixMap input1(reinterpret_cast<T *>(input1_tensor->GetData()), input1_shape[0], input1_shape[1]);
auto output_tensor = ctx.Output(kFirstOutputIndex);
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
MatrixMap output(reinterpret_cast<T *>(output_tensor->GetData()), output_shape[0], output_shape[1]);
if (transpose_x1) {
if (transpose_x2) {
output = input0.transpose() * input1.transpose();
} else {
output = input0.transpose() * input1;
}
} else {
if (transpose_x2) {
output = input0 * input1.transpose();
} else {
output = input0 * input1;
}
}
if (ctx.GetInputsSize() == 3) {
return BiasCompute<T>(ctx);
}
return KERNEL_STATUS_OK;
}
uint32_t MatMulCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
uint32_t input_num = ctx.GetInputsSize();
uint32_t output_num = ctx.GetOutputsSize();
if ((input_num != 2 && input_num != 3) || output_num != 1) {
KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto input0_tensor = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input0_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x1] data failed",
ctx.GetOpType().c_str())
auto input1_tensor = ctx.Input(1);
auto input1_tensor_shape = input1_tensor->GetTensorShape();
KERNEL_CHECK_NULLPTR(input1_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x2] data failed",
ctx.GetOpType().c_str())
DataType input0_data_type = input0_tensor->GetDataType();
DataType input1_data_type = input1_tensor->GetDataType();
KERNEL_CHECK_FALSE((input0_data_type == input1_data_type), KERNEL_STATUS_PARAM_INVALID,
"Input[x1] data type[%s] and input[x2] data type[%s] must be same",
DTypeStr(input0_data_type).c_str(), DTypeStr(input1_data_type).c_str())
KERNEL_LOG_DEBUG("%s op input[x1] data type is [%s].", kMatmul, DTypeStr(input0_data_type).c_str());
uint32_t ret = KERNEL_STATUS_OK;
switch (input0_data_type) {
case DT_FLOAT:
ret = MatMulCompute<float>(ctx);
break;
case DT_DOUBLE:
ret = MatMulCompute<double>(ctx);
break;
case DT_FLOAT16:
ret = MatMulCompute<Eigen::half>(ctx);
break;
case DT_INT32:
ret = MatMulCompute<int32_t>(ctx);
break;
case DT_COMPLEX64:
ret = MatMulCompute<std::complex<float>>(ctx);
break;
case DT_COMPLEX128:
ret = MatMulCompute<std::complex<double>>(ctx);
break;
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(input0_data_type).c_str());
ret = KERNEL_STATUS_PARAM_INVALID;
}
return ret;
}
REGISTER_CPU_KERNEL(kMatmul, MatMulCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_HOST_MATMUL_H_
#define AICPU_KERNELS_HOST_MATMUL_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MatMulCpuKernel : public CpuKernel {
public:
MatMulCpuKernel() = default;
~MatMulCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t AddCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t BiasCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t MatMulCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,320 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "matrix_exp.h"
#include <array>
#include <complex>
#include <cmath>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kMatrixExpInputNum = 1;
constexpr uint32_t kMatrixExpOutputNum = 1;
constexpr uint32_t kIndexTwo = 2;
const int64_t paralled_data_size = 8 * 1024;
const char *kMatrixExp = "MatrixExp";
constexpr int total_n_degs = 6;
// Coefficients for computing taylor approximant of order 8.
constexpr double sqrt_177 = 0.1330413469565007072504e+2, x3 = 2. / 3.;
constexpr double x1 = x3 * ((1. + sqrt_177) / 88.), x2 = x3 * ((1. + sqrt_177) / 352.);
constexpr double x4 = (-271. + 29. * sqrt_177) / (315. * x3), x5 = (-11. + 11. * sqrt_177) / (1260. * x3);
constexpr double x6 = (-99. + 11. * sqrt_177) / (5040. * x3), x7 = (89. - sqrt_177) / (5040. * x3);
constexpr double y2 = (857. - 58. * sqrt_177) / 630.;
template <typename T, int ROW, int COL>
using array2d = std::array<std::array<T, COL>, ROW>;
// Coefficients for computing taylor approximant of order 12.
constexpr int num_prods_12 = 4;
array2d<double, num_prods_12, num_prods_12> b12 = {
{{9.0198e-16, 0.46932117595418237389, -0.20099424927047284052, -0.04623946134063071740},
{5.31597895759871264183, 1.19926790417132231573, 0.01179296240992997031, 0.01108844528519167989},
{0.18188869982170434744, 0.05502798439925399070, 0.09351590770535414968, 0.00610700528898058230},
{-2.0861320e-13, -0.13181061013830184015, -0.02027855540589259079, -0.00675951846863086359}}};
// Coefficients for computing taylor approximant of order 18.
constexpr int num_prods_18 = 5;
array2d<double, num_prods_18, num_prods_18> b18 = {
{{0., -1.00365581030144618291e-01, -8.02924648241156932449e-03, -8.92138498045729985177e-04, 0.},
{0., 3.97849749499645077844e-01, 1.36783778460411720168e+00, 4.98289622525382669416e-01,
-6.37898194594723280150e-04},
{-1.09676396052962061844e+01, 1.68015813878906206114e+00, 5.71779846478865511061e-02, -6.98210122488052056106e-03,
3.34975017086070470649e-05},
{-9.04316832390810593223e-02, -6.76404519071381882256e-02, 6.75961301770459654925e-02, 2.95552570429315521194e-02,
-1.39180257516060693404e-05},
{0., 0., -9.23364619367118555360e-02, -1.69364939002081722752e-02, -1.40086798182036094347e-05}}};
// Threshold for different order of taylor approximant.
constexpr std::array<float, total_n_degs> thetas_float = {1.192092800768788e-07, 5.978858893805233e-04,
5.116619363445086e-02, 5.800524627688768e-01,
1.461661507209034e+00, 3.010066362817634e+00};
// Threshold for different order of taylor approximant.
constexpr std::array<double, total_n_degs> thetas_double = {2.220446049250313e-16, 2.580956802971767e-08,
3.397168839976962e-04, 4.991228871115323e-02,
2.996158913811580e-01, 1.090863719290036e+00};
#define MATRIX_EXP_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MatrixExpCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \
return result; \
} \
break; \
}
#define MATRIX_EXP_COMPUTE_DIFF_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MatrixExpDiffTypeCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MatrixExpCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMatrixExpInputNum, kMatrixExpOutputNum),
"[%s] check input and output number failed.", kMatrixExp);
KERNEL_HANDLE_ERROR(MatrixExpCheck(ctx), "[%s] check params failed.", kMatrixExp);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MATRIX_EXP_COMPUTE_CASE(DT_FLOAT, float, ctx)
MATRIX_EXP_COMPUTE_CASE(DT_DOUBLE, double, ctx)
MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
MATRIX_EXP_COMPUTE_DIFF_CASE(DT_FLOAT16, Eigen::half, ctx)
default:
KERNEL_LOG_ERROR("MatrixExp kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MatrixExpCpuKernel::MatrixExpCheck(CpuKernelContext &ctx) {
auto input_0 = ctx.Input(0);
std::vector<int64_t> shape_x = input_0->GetTensorShape()->GetDimSizes();
size_t shape_size_x = shape_x.size();
KERNEL_CHECK_FALSE((shape_size_x > 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 2, got [%zu].",
shape_size_x)
KERNEL_CHECK_FALSE((shape_x[shape_size_x - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
"Input x's last dimension must be at least 1.")
KERNEL_CHECK_FALSE((shape_x[shape_size_x - kIndexTwo] == shape_x[shape_size_x - 1]), KERNEL_STATUS_PARAM_INVALID,
"Input x's last two dimensions must be equal, but are [%lld] and [%lld].",
shape_x[shape_size_x - kIndexTwo], shape_x[shape_size_x - 1])
return KERNEL_STATUS_OK;
}
template <typename Derived1, typename Derived2, typename Derived3>
void MatrixExpCpuKernel::MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
int order, Eigen::MatrixBase<Derived3> &E) {
constexpr int expansion_order_1 = 1;
constexpr int expansion_order_2 = 2;
constexpr int expansion_order_4 = 4;
constexpr int expansion_order_8 = 8;
constexpr int expansion_order_12 = 12;
auto A2 = A * A;
auto A3 = A * A2;
if (order == expansion_order_1) {
E = I + A;
} else if (order == expansion_order_2) {
constexpr int A2_divisor = 2;
E = I + A + A2 / A2_divisor;
} else if (order == expansion_order_4) {
constexpr int I_divisor = 2;
constexpr int A_divisor = 6;
constexpr int A2_divisor = 24;
E = I + A + A2 * (I / I_divisor + A / A_divisor + A2 / A2_divisor);
} else if (order == expansion_order_8) {
auto A4 = A2 * (x1 * A + x2 * A2);
auto A8 = (x3 * A2 + A4) * (x4 * I + x5 * A + x6 * A2 + x7 * A4);
E = I + A + y2 * A2 + A8;
} else if (order == expansion_order_12) {
auto q31 = b12[0][0] * I + b12[0][1] * A + b12[0][2] * A2 + b12[0][3] * A3;
auto q32 = b12[1][0] * I + b12[1][1] * A + b12[1][2] * A2 + b12[1][3] * A3;
auto q33 = b12[2][0] * I + b12[2][1] * A + b12[2][2] * A2 + b12[2][3] * A3;
auto q34 = b12[3][0] * I + b12[3][1] * A + b12[3][2] * A2 + b12[3][3] * A3;
auto q61 = q33 + q34 * q34;
E = q31 + (q32 + q61) * q61;
} else {
auto A6 = A3 * A3;
auto q31 = b18[0][0] * I + b18[0][1] * A + b18[0][2] * A2 + b18[0][3] * A3 + b18[0][4] * A6;
auto q61 = b18[1][0] * I + b18[1][1] * A + b18[1][2] * A2 + b18[1][3] * A3 + b18[1][4] * A6;
auto q62 = b18[2][0] * I + b18[2][1] * A + b18[2][2] * A2 + b18[2][3] * A3 + b18[2][4] * A6;
auto q63 = b18[3][0] * I + b18[3][1] * A + b18[3][2] * A2 + b18[3][3] * A3 + b18[3][4] * A6;
auto q64 = b18[4][0] * I + b18[4][1] * A + b18[4][2] * A2 + b18[4][3] * A3 + b18[4][4] * A6;
auto q91 = q31 * q64 + q63;
E = q61 + (q62 + q91) * q91;
}
}
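// Scaling-and-squaring driver. The 1-norm of A (maximum column sum) selects the cheapest Taylor order
// whose threshold it satisfies; for larger norms the order-18 approximant is used, with A pre-scaled by
// 2^-s and the result squared s times once the norm exceeds the largest threshold.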
template <typename Derived1, typename Derived2>
void MatrixExpCpuKernel::MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
Eigen::MatrixBase<Derived1> &mexp, CpuKernelContext &ctx) {
const auto norm = A.cwiseAbs().colwise().sum().maxCoeff();
constexpr std::array<int, total_n_degs> m_vals = {1, 2, 4, 8, 12, 18};
constexpr int cut_deg = 2;
int64_t s = -1;
auto data_type = ctx.Input(0)->GetDataType();
if (data_type == DT_FLOAT16 || data_type == DT_FLOAT || data_type == DT_COMPLEX64) {
for (int i = 0; i < total_n_degs - 1; i++) {
if (norm <= thetas_float[i]) {
MTaylorApproximant(A, I, m_vals[i], mexp);
break;
}
}
if (norm >= thetas_float[total_n_degs - cut_deg]) {
s = ceil(log2(norm / thetas_float[total_n_degs - 1]));
if (s <= 0) {
s = 0;
}
}
} else {
for (int i = 0; i < total_n_degs - 1; i++) {
if (norm <= thetas_double[i]) {
MTaylorApproximant(A, I, m_vals[i], mexp);
break;
}
}
if (norm >= thetas_double[total_n_degs - cut_deg]) {
s = ceil(log2(norm / thetas_double[total_n_degs - 1]));
if (s <= 0) {
s = 0;
}
}
}
if (s >= 0) {
const auto pow2s = pow(2, s);
const auto A_scaled = A / pow2s;
MTaylorApproximant(A_scaled, I, m_vals[total_n_degs - 1], mexp);
for (int k = 0; k < s; k++) {
mexp = mexp * mexp;
}
}
}
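// Treats the input as a batch of [m, m] matrices and exponentiates each one independently. Small inputs
// are processed serially; larger ones are sharded over matrices via ParallelFor.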
template <typename T>
uint32_t MatrixExpCpuKernel::MatrixExpCompute(CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
size_t shape_size = shape_x.size();
int64_t m = shape_x[shape_size - 1];
int64_t size_mm = m * m;
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
MatrixXd I(m, m);
I.setIdentity();
int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
if (data_size <= paralled_data_size) {
for (int64_t i = 0; i < matrix_num; i++) {
Eigen::Map<MatrixXd> matrix_x(input_x + i * m * m, m, m);
Eigen::Map<MatrixXd> matrix_y(output_y + i * m * m, m, m);
if (matrix_x.size() > 0) {
MexpImpl(matrix_x, I, matrix_y, ctx);
}
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto shard_work = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Eigen::Map<MatrixXd> matrix_x(input_x + i * m * m, m, m);
Eigen::Map<MatrixXd> matrix_y(output_y + i * m * m, m, m);
if (matrix_x.size() > 0) {
MexpImpl(matrix_x, I, matrix_y, ctx);
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work),
"MatrixExp Compute failed.");
}
return KERNEL_STATUS_OK;
}
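// float16 helper: widens the i-th half-precision matrix to float, runs MexpImpl, and casts the result
// back to Eigen::half.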
void MatrixExpCpuKernel::TypeChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y,
CpuKernelContext &ctx) {
typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
MatrixXd I(m, m);
(void)I.setIdentity();
MatrixXd matrix_x(m, m);
MatrixXd matrix_y(m, m);
int64_t size_mm = m * m;
for (int p = 0; p < m; p++) {
for (int q = 0; q < m; q++) {
matrix_x(p, q) = static_cast<float>(input_x[i * size_mm + p * m + q]);
}
}
if (matrix_x.size() > 0) {
MexpImpl(matrix_x, I, matrix_y, ctx);
}
for (int p = 0; p < m; p++) {
for (int q = 0; q < m; q++) {
output_y[i * size_mm + p * m + q] = static_cast<Eigen::half>(matrix_y(p, q));
}
}
}
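// float16 entry point: same serial/parallel batching as MatrixExpCompute, with every matrix routed
// through the conversion helper above.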
template <typename T>
uint32_t MatrixExpCpuKernel::MatrixExpDiffTypeCompute(CpuKernelContext &ctx) {
T *input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
size_t shape_size = shape_x.size();
int64_t m = shape_x[shape_size - 1];
int64_t size_mm = m * m;
int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
if (data_size <= paralled_data_size) {
for (int64_t i = 0; i < matrix_num; i++) {
TypeChangeForFp16(i, m, input_x, output_y, ctx);
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto shard_work = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
TypeChangeForFp16(i, m, input_x, output_y, ctx);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work),
"MatrixExp Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMatrixExp, MatrixExpCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,50 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_
#define AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_
#include "cpu_ops_kernel.h"
#include "utils/eigen_tensor.h"
namespace aicpu {
class MatrixExpCpuKernel : public CpuKernel {
public:
MatrixExpCpuKernel() = default;
~MatrixExpCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MatrixExpCheck(CpuKernelContext &ctx);
template <typename Derived1, typename Derived2, typename Derived3>
void MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I, int order,
Eigen::MatrixBase<Derived3> &E);
template <typename Derived1, typename Derived2>
void MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
Eigen::MatrixBase<Derived1> &mexp, CpuKernelContext &ctx);
template <typename T>
uint32_t MatrixExpCompute(CpuKernelContext &ctx);
void TypeChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y, CpuKernelContext &ctx);
template <typename T>
uint32_t MatrixExpDiffTypeCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,460 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "maximum.h"
#include "Eigen/Dense"
#include "cmath"
#include "cpu_kernel_utils.h"
#include "iostream"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kMaximum = "Maximum";
// When the input data size exceeds kParallelDataNum, use the parallel compute path.
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define MAXIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MaximumCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Maximum kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MaximumCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Maximum check input and output number failed.");
KERNEL_HANDLE_ERROR(MaximumParamCheck(ctx), "Maximum check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MAXIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
MAXIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
MAXIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
MAXIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
MAXIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("Maximum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MaximumCpuKernel::MaximumParamCheck(CpuKernelContext &ctx) {
// The non-null checks for input_0, input_1 and output have already been done in NormalCheck.
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"MaximumCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
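// Element-wise maximum for inputs with the same number of elements. With ignore_nan == false a NaN in
// either operand is propagated to the output; with ignore_nan == true the non-NaN operand is returned.
// is_float16 selects Eigen::numext::isnan for Eigen::half data.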
template <typename T>
void MaximumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input2 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input2 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
}
}
template <typename T>
void MaximumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1))) {
*(output + i) = *(input2 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input1);
} else {
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1))) {
*(output + i) = *(input2 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input1);
} else {
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1))) {
*(output + i) = *(input1);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1))) {
*(output + i) = *(input1);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
}
}
}
}
template <typename T>
void MaximumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input2);
} else if (Eigen::numext::isnan(*(input2))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input2);
} else if (isnan(*(input2))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (Eigen::numext::isnan(*(input2))) {
*(output + i) = *(input2);
} else {
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (isnan(*(input2))) {
*(output + i) = *(input2);
} else {
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
}
}
}
}
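// Classifies T once (only Eigen::half needs the half-precision NaN check) and dispatches to the loop
// matching the detected broadcast pattern.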
template <typename T>
void MaximumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
bool is_float16 = false;
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
std::is_same<T, double>::value) {
is_float16 = false;
} else {
is_float16 = true;
}
switch (type) {
case BcastShapeType::SAME_SHAPE:
SpecialComputeSameShape<T>(start, end, ctx, is_float16);
break;
case BcastShapeType::X_ONE_ELEMENT:
SpecialComputeXOneElement<T>(start, end, ctx, is_float16);
break;
case BcastShapeType::Y_ONE_ELEMENT:
SpecialComputeYOneElement<T>(start, end, ctx, is_float16);
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
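// No-broadcast path: same shape, or one operand is a single element. Outputs larger than
// kParallelDataNumSameShape elements are sharded with ParallelFor (capped at 4 cores for mid-sized data).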
template <typename T>
uint32_t MaximumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_fmax = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax),
"Maximum Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, ctx);
}
return KERNEL_STATUS_OK;
}
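// Broadcast path for the parallel case: GetBroadcastXIndex / GetBroadcastYIndex map every output index
// back to the matching input elements before the NaN-aware comparison is applied.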
template <typename T>
void MaximumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast,
bool is_float16) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
}
}
template <typename T>
void MaximumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
int64_t data_num = ctx.Output(0)->NumElements();
for (int64_t i = 0; i < data_num; ++i) {
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
}
}
template <typename T>
uint32_t MaximumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
int64_t data_num = ctx.Output(0)->NumElements();
bool is_float16 = false;
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
std::is_same<T, double>::value) {
is_float16 = false;
} else {
is_float16 = true;
}
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_fmax = [&](int64_t start, int64_t end) {
BcastComputeMultiKernel<T>(start, end, ctx, bcast, is_float16);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax),
"Maximum Compute failed.");
} else {
BcastComputeOneKernel<T>(ctx, bcast, is_float16);
}
return KERNEL_STATUS_OK;
}
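// Per-dtype entry point: takes the fast path when the shapes already match or one operand is a scalar,
// otherwise builds the broadcast mapping and runs BcastCompute.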
template <typename T>
uint32_t MaximumCpuKernel::MaximumCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaximum, MaximumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,63 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
#define AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MaximumCpuKernel : public CpuKernel {
public:
MaximumCpuKernel() = default;
~MaximumCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MaximumParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
template <typename T>
void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
template <typename T>
void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
template <typename T>
uint32_t MaximumCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,456 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minimum.h"
#include "Eigen/Dense"
#include "cmath"
#include "cpu_kernel_utils.h"
#include "iostream"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kMinimum = "Minimum";
// When the input data size exceeds kParallelDataNum, use the parallel compute path.
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define MINIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MinimumCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Minimum kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MinimumCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Minimum check input and output number failed.");
KERNEL_HANDLE_ERROR(MinimumParamCheck(ctx), "Minimum check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MINIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
MINIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
MINIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
MINIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
MINIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("Minimum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MinimumCpuKernel::MinimumParamCheck(CpuKernelContext &ctx) {
// The non-null checks for input_0, input_1 and output have already been done in NormalCheck.
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"MinimumCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
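// The Minimum special-compute helpers mirror the Maximum kernel: identical ignore_nan handling, with the
// comparison reversed so the smaller operand is kept.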
template <typename T>
void MinimumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input2 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input2 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
}
}
}
}
template <typename T>
void MinimumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*input1)) {
*(output + i) = *input1;
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*input1)) {
*(output + i) = *input1;
} else if (isnan(*(input2 + i))) {
*(output + i) = *(input2 + i);
} else {
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*input1)) {
*(output + i) = *(input2 + i);
} else if (Eigen::numext::isnan(*(input2 + i))) {
*(output + i) = *input1;
} else {
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*input1)) {
*(output + i) = *(input2 + i);
} else if (isnan(*(input2 + i))) {
*(output + i) = *input1;
} else {
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
}
}
}
}
template <typename T>
void MinimumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (Eigen::numext::isnan(*input2)) {
*(output + i) = *input2;
} else {
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *(input1 + i);
} else if (isnan(*input2)) {
*(output + i) = *input2;
} else {
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(input1 + i))) {
*(output + i) = *input2;
} else if (Eigen::numext::isnan(*input2)) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(input1 + i))) {
*(output + i) = *input2;
} else if (isnan(*input2)) {
*(output + i) = *(input1 + i);
} else {
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
}
}
}
}
template <typename T>
void MinimumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
bool is_float16 = false;
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
std::is_same<T, double>::value) {
is_float16 = false;
} else {
is_float16 = true;
}
switch (type) {
case BcastShapeType::SAME_SHAPE:
SpecialComputeSameShape<T>(start, end, ctx, is_float16);
break;
case BcastShapeType::X_ONE_ELEMENT:
SpecialComputeXOneElement<T>(start, end, ctx, is_float16);
break;
case BcastShapeType::Y_ONE_ELEMENT:
SpecialComputeYOneElement<T>(start, end, ctx, is_float16);
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
template <typename T>
uint32_t MinimumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_minimum = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum),
"Minimum Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, ctx);
}
return KERNEL_STATUS_OK;
}
template <typename T>
void MinimumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast,
bool is_float16) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
for (int64_t i = start; i < end; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
}
}
template <typename T>
void MinimumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto ignore_nan = false;
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
int64_t data_num = ctx.Output(0)->NumElements();
for (int64_t i = 0; i < data_num; ++i) {
if (ignore_nan == false && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == false && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == true) {
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
if (ignore_nan == true && is_float16 == false) {
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
? *(in0 + bcast.GetBroadcastXIndex(i))
: *(in1 + bcast.GetBroadcastYIndex(i));
}
}
}
}
template <typename T>
uint32_t MinimumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
int64_t data_num = ctx.Output(0)->NumElements();
bool is_float16 = false;
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
std::is_same<T, double>::value) {
is_float16 = false;
} else {
is_float16 = true;
}
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_minimum = [&](int64_t start, int64_t end) {
BcastComputeMultiKernel<T>(start, end, ctx, bcast, is_float16);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum),
"Minimum Compute failed.");
} else {
BcastComputeOneKernel<T>(ctx, bcast, is_float16);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MinimumCpuKernel::MinimumCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMinimum, MinimumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,63 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MINIMUM_H_
#define AICPU_KERNELS_NORMALIZED_MINIMUM_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MinimumCpuKernel : public CpuKernel {
public:
MinimumCpuKernel() = default;
~MinimumCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MinimumParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
template <typename T>
void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
template <typename T>
void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
template <typename T>
uint32_t MinimumCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -1,5 +1,5 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View File

@ -171,11 +171,8 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kKLDivOpName,
mindspore::kKlDivLossGradOpName,
mindspore::kLcmOpName,
mindspore::kLessEqualOpName,
mindspore::kLogicalXorOpName,
mindspore::kLogitOpName,
mindspore::kLogitGradOpName,
mindspore::kLogNormalReverseOpName,
mindspore::kLowerBoundOpName,
mindspore::kLstsqOpName,
mindspore::kLuUnpackOpName,