forked from mindspore-Ecosystem/mindspore
second half of 0103 aicpu migration without IsInf, del less_equal, log_normal_reverse, logical_xor
This commit is contained in:
parent
680f21a547
commit
3f71793aa3
@@ -0,0 +1,228 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "hypot.h"

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kHypot = "Hypot";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;

#define HYPOT_COMPUTE_CASE(DTYPE, TYPE, CTX)              \
  case (DTYPE): {                                         \
    uint32_t result = HypotCompute<TYPE>(CTX);            \
    if (result != KERNEL_STATUS_OK) {                     \
      KERNEL_LOG_ERROR("Hypot kernel compute failed.");   \
      return result;                                      \
    }                                                     \
    break;                                                \
  }
}  // namespace

namespace aicpu {
template <typename T>
T hypot(T a, T b) {
  return std::hypot(a, b);
}

uint32_t HypotCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Hypot check input and output number failed.");
  KERNEL_HANDLE_ERROR(HypotParamCheck(ctx), "Hypot check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    HYPOT_COMPUTE_CASE(DT_FLOAT, float_t, ctx)
    HYPOT_COMPUTE_CASE(DT_DOUBLE, double_t, ctx)
    default:
      KERNEL_LOG_ERROR("Hypot kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t HypotCpuKernel::HypotParamCheck(CpuKernelContext &ctx) {
  Tensor *input_0 = ctx.Input(0);
  Tensor *input_1 = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  DataType input0_type = input_0->GetDataType();
  DataType input1_type = input_1->GetDataType();
  KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] need be same with "
                     "input1 [%s].",
                     DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
  KERNEL_LOG_DEBUG(
    "HypotCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HypotCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
  auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t in0_elements_nums = ctx.Input(0)->NumElements();
  int64_t in1_elements_nums = ctx.Input(1)->NumElements();
  int64_t data_num = ctx.Output(0)->NumElements();
  BcastShapeType type;

  if (in0_elements_nums == in1_elements_nums) {
    type = BcastShapeType::SAME_SHAPE;
  } else {
    type = (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
  }

  if (data_num >= kParallelDataNumSameShape) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumSameShapeMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_hypot = [&](int64_t start, int64_t end) {
      switch (type) {
        case BcastShapeType::SAME_SHAPE:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = hypot(*(in0 + i), *(in1 + i));
          }
          break;
        case BcastShapeType::X_ONE_ELEMENT:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = hypot(*in0, *(in1 + i));
          }
          break;
        case BcastShapeType::Y_ONE_ELEMENT:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = hypot(*(in0 + i), *in1);
          }
          break;
        default:
          KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
          break;
      }
    };
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0");
      return KERNEL_STATUS_PARAM_INVALID;
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot),
                        "Hypot Compute failed.");
  } else {
    switch (type) {
      case BcastShapeType::SAME_SHAPE:
        for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
          *(out + i) = hypot(*(in0 + i), *(in1 + i));
        }
        break;
      case BcastShapeType::X_ONE_ELEMENT:
        for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
          *(out + i) = hypot(*in0, *(in1 + i));
        }
        break;
      case BcastShapeType::Y_ONE_ELEMENT:
        for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
          *(out + i) = hypot(*(in0 + i), *in1);
        }
        break;
      default:
        KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
        break;
    }
  }

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HypotCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
  T *in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  T *in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  T *out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t data_num = ctx.Output(0)->NumElements();
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_hypot = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; ++i) {
        *(out + i) = hypot<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0");
      return KERNEL_STATUS_PARAM_INVALID;
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot),
                        "Hypot Compute failed.");
  } else {
    for (int64_t i = 0; i < data_num; ++i) {
      *(out + i) = hypot<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HypotCpuKernel::HypotCompute(CpuKernelContext &ctx) {
  Tensor *input0_tensor = ctx.Input(0);
  auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
  int64_t input0_elements_nums = input0_tensor->NumElements();

  Tensor *input1_tensor = ctx.Input(1);
  auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
  int64_t input1_elements_nums = input1_tensor->NumElements();

  bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
  if (isNeedBcast) {
    return NoBcastCompute<T>(ctx);
  } else {
    Bcast bcast(input0_shape, input1_shape);
    if (!bcast.IsValid()) {
      KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }

    return BcastCompute<T>(ctx, bcast);
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kHypot, HypotCpuKernel);
}  // namespace aicpu
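
Note on the kernel above: Hypot computes the elementwise Euclidean norm sqrt(a^2 + b^2) through std::hypot, which is more robust against intermediate overflow and underflow than the naive formula. The sharder distinguishes three shapes: equal element counts, a scalar left operand, and a scalar right operand. A minimal standalone sketch (plain host C++, not part of the kernel) of the first two paths:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> a{3.0, 5.0, 8.0};
  std::vector<double> b{4.0, 12.0, 15.0};
  // SAME_SHAPE path: element i of a pairs with element i of b.
  for (size_t i = 0; i < a.size(); ++i) {
    std::printf("hypot(%g, %g) = %g\n", a[i], b[i], std::hypot(a[i], b[i]));  // 5, 13, 17
  }
  // X_ONE_ELEMENT path: a single left value is broadcast against every element of b.
  const double x0 = 9.0;
  for (size_t i = 0; i < b.size(); ++i) {
    std::printf("hypot(%g, %g) = %g\n", x0, b[i], std::hypot(x0, b[i]));
  }
  return 0;
}
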
@@ -0,0 +1,43 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_HYPOT_H_
#define AICPU_KERNELS_NORMALIZED_HYPOT_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class HypotCpuKernel : public CpuKernel {
 public:
  HypotCpuKernel() = default;
  ~HypotCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t HypotParamCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t NoBcastCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);

  template <typename T>
  uint32_t HypotCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,81 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "identityn.h"
#include <algorithm>
#include <vector>
#include "cpu_types.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/kernel_util.h"

namespace {
const char *kIdentityN = "IdentityN";
}  // namespace

namespace aicpu {
uint32_t IdentityNCpuKernel::IdentityNParamCheck(CpuKernelContext &ctx) {
  // input size and output size check
  uint32_t input_size = ctx.GetInputsSize();
  uint32_t output_size = ctx.GetOutputsSize();
  KERNEL_CHECK_FALSE((input_size == output_size), KERNEL_STATUS_PARAM_INVALID,
                     "Input size should equal to Output size.");
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_size, output_size), "[%s] check params failed.", kIdentityN);
  for (uint32_t idx = 0; idx < input_size; ++idx) {
    Tensor *in_tensor = ctx.Input(idx);
    Tensor *out_tensor = ctx.Output(idx);
    // TensorShape check
    auto in_shape = in_tensor->GetTensorShape();
    auto out_shape = out_tensor->GetTensorShape();
    KERNEL_CHECK_FALSE((in_shape->GetDimSizes() == out_shape->GetDimSizes()), KERNEL_STATUS_PARAM_INVALID,
                       "In tensor shape should equal to out tensor shape.");
    // DataType Check
    DataType in_type = in_tensor->GetDataType();
    DataType out_type = out_tensor->GetDataType();
    KERNEL_CHECK_FALSE((in_type == out_type), KERNEL_STATUS_PARAM_INVALID,
                       "In tensor data type should equal to out tensor data type.");
    bool type_support =
      std::find(support_data_type.begin(), support_data_type.end(), in_type) != support_data_type.end();
    KERNEL_CHECK_FALSE(type_support, KERNEL_STATUS_PARAM_INVALID, "IdentityN kernel data type [%s] not support.",
                       DTypeStr(in_type).c_str());
  }
  return KERNEL_STATUS_OK;
}

uint32_t IdentityNCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(IdentityNParamCheck(ctx), "IdentityNCpuKernel check params failed");
  uint32_t input_size = ctx.GetInputsSize();
  for (uint32_t idx = 0; idx < input_size; ++idx) {
    Tensor *in_tensor = ctx.Input(idx);
    Tensor *out_tensor = ctx.Output(idx);
    auto in_data = in_tensor->GetData();
    auto out_data = out_tensor->GetData();
    uint64_t in_size = in_tensor->GetDataSize();
    uint64_t out_size = out_tensor->GetDataSize();

    // memory copy
    if (out_data != in_data) {
      int cpret = memcpy_s(out_data, out_size, in_data, in_size);
      KERNEL_CHECK_FALSE((cpret == EOK), KERNEL_STATUS_INNER_ERROR,
                         "[%s] memcpy_s to output failed, destMax [%ld], count [%ld].", kIdentityN, out_size, in_size);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kIdentityN, IdentityNCpuKernel);
}  // namespace aicpu
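
Note on the kernel above: IdentityN is a pure pass-through; each output tensor receives a byte-for-byte copy of the matching input unless the two already share a buffer, and memcpy_s (from securec) enforces that the destination is large enough. A standalone analogue of that copy step using std::memcpy, with CopyTensorBytes being an illustrative helper name only, not part of the kernel:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Illustrative stand-in for the memcpy_s call: refuse to copy more bytes than
// the destination holds, and skip the copy entirely when src and dst alias.
bool CopyTensorBytes(void *dst, uint64_t dst_size, const void *src, uint64_t src_size) {
  if (dst == nullptr || src == nullptr || src_size > dst_size) {
    return false;  // memcpy_s reports this as a non-EOK error code
  }
  if (dst != src) {
    std::memcpy(dst, src, src_size);
  }
  return true;
}

int main() {
  int32_t in[4] = {1, 2, 3, 4};
  int32_t out[4] = {0};
  bool ok = CopyTensorBytes(out, sizeof(out), in, sizeof(in));
  std::printf("ok=%d out[3]=%d\n", ok, out[3]);  // ok=1 out[3]=4
  return 0;
}
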
@@ -0,0 +1,36 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_
#define AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_

#include <vector>  // for the support_data_type member below

#include "cpu_ops_kernel.h"

namespace aicpu {
class IdentityNCpuKernel : public CpuKernel {
 public:
  IdentityNCpuKernel() = default;
  ~IdentityNCpuKernel() = default;

  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t IdentityNParamCheck(CpuKernelContext &ctx);
  const std::vector<DataType> support_data_type = {DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8,
                                                   DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE};
};
}  // namespace aicpu
#endif
@@ -0,0 +1,230 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "index_fill.h"

#include <securec.h>

#include <map>

#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kNumInput = 4;
const uint32_t kNumOutput = 1;
const char *kIndexFill = "IndexFill";

// when input data size is more than kParallelDataNum, use Parallel func
const uint32_t kParallelDataNum = 16 * 1024;
const uint32_t kParallelDataNumMid = 128 * 1024;

#define INDEXFILL_COMPUTE_CASE(DTYPE, TYPE, CTX)              \
  case (DTYPE): {                                             \
    uint32_t result = DoCompute<TYPE>(CTX);                   \
    if (result != KERNEL_STATUS_OK) {                         \
      KERNEL_LOG_ERROR("IndexFill kernel compute failed.");   \
      return result;                                          \
    }                                                         \
    break;                                                    \
  }
}  // namespace

namespace aicpu {
uint32_t IndexFillCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNumInput, kNumOutput), "IndexFill check input and output number failed.");
  // get input Tensors
  for (uint32_t i = 0; i < kNumInput; ++i) {
    Tensor *tensor = ctx.Input(i);
    inputs_.push_back(tensor);
  }
  // get output Tensors
  Tensor *tensor = ctx.Output(0);
  outputs_.push_back(tensor);

  int32_t value_dim = inputs_[3]->GetTensorShape()->GetDims();

  KERNEL_CHECK_FALSE((value_dim == 0), KERNEL_STATUS_INNER_ERROR,
                     "IndexFill only supports a 0-dimensional value tensor, "
                     "but got tensor with [%d] dimension(s).",
                     value_dim)

  DataType dim_type = inputs_[1]->GetDataType();
  DataType index_type = inputs_[2]->GetDataType();

  if (dim_type != DT_INT32) {
    KERNEL_LOG_ERROR("IndexFill: Expected dtype int32 for dim.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (index_type != DT_INT32) {
    KERNEL_LOG_ERROR("IndexFill: Expected dtype int32 for index.");
    return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

template <typename T>
void IndexFillCpuKernel::SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim,
                                        std::map<int32_t, bool> &index_dict) {
  auto *input_x = reinterpret_cast<T *>(inputs_[0]->GetData());
  auto *input_value = reinterpret_cast<T *>(inputs_[3]->GetData());
  auto *output_y = reinterpret_cast<T *>(outputs_[0]->GetData());
  int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims();
  auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes();

  int32_t dim_flag;
  if (x_dim_nums != 0) {
    dim_flag = *input_dim % x_dim_nums + 1;
  } else {
    dim_flag = 0;
  }

  int32_t remain_dims = 1;
  if (dim_flag == x_dim_nums) {
    if (dim_flag != 0) {
      remain_dims = x_dims[*input_dim];
    }
    for (int64_t i = start; i < end; i++) {
      int32_t index_flag = i % remain_dims;
      std::map<int32_t, bool>::iterator f = index_dict.find(index_flag);
      if (f != index_dict.end()) {
        output_y[i] = *input_value;
      } else {
        output_y[i] = input_x[i];
      }
    }
  } else {
    for (int32_t i = *input_dim + 1; i < x_dim_nums; i++) {
      remain_dims *= x_dims[i];
    }
    for (int64_t i = start; i < end; i++) {
      int32_t index_flag = (i / remain_dims) % x_dims[*input_dim];
      std::map<int32_t, bool>::iterator f = index_dict.find(index_flag);
      if (f != index_dict.end()) {
        output_y[i] = *input_value;
      } else {
        output_y[i] = input_x[i];
      }
    }
  }
}

template <typename T>
uint32_t IndexFillCpuKernel::DoCompute(CpuKernelContext &ctx) {
  int32_t *input_1 = reinterpret_cast<int32_t *>(inputs_[1]->GetData());
  int32_t *input_2 = reinterpret_cast<int32_t *>(inputs_[2]->GetData());

  int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims();
  int32_t dim_nums = inputs_[1]->GetTensorShape()->GetDims();
  int32_t index_dim_nums = inputs_[2]->GetTensorShape()->GetDims();
  auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes();

  uint32_t data_num = outputs_[0]->NumElements();
  int64_t index_num = inputs_[2]->GetTensorShape()->NumElements();

  KERNEL_CHECK_FALSE(dim_nums == 0, KERNEL_STATUS_PARAM_INVALID, "Dim has to be a scalar.")
  KERNEL_CHECK_FALSE(index_dim_nums <= 1, KERNEL_STATUS_PARAM_INVALID, "Index has to be a vector/scalar.")

  int32_t cur_dim = *input_1;
  if (*input_1 < 0) {
    *input_1 = *input_1 + x_dim_nums;
  }

  std::map<int32_t, bool> index_dict;
  if (x_dim_nums == 0) {
    for (int32_t i = 0; i < index_num; i++) {
      if (input_2[i] < -1 || input_2[i] > 0) {
        KERNEL_LOG_ERROR("Invalid argument 3: out of range.");
        return KERNEL_STATUS_PARAM_INVALID;
      } else {
        index_dict.insert(std::pair<int32_t, bool>(0, true));
      }
    }
  } else if (cur_dim < -x_dim_nums || cur_dim >= x_dim_nums) {
    KERNEL_LOG_ERROR(
      "Dimension out of range (expected to be in range of "
      "[%d, %d], but got %d).",
      0 - x_dim_nums, x_dim_nums - 1, cur_dim);
    return KERNEL_STATUS_PARAM_INVALID;
  } else {
    for (int32_t i = 0; i < index_num; i++) {
      if (input_2[i] < -x_dims[*input_1] || input_2[i] >= x_dims[*input_1]) {
        KERNEL_LOG_ERROR("Invalid argument 3: out of range.");
        return KERNEL_STATUS_PARAM_INVALID;
      } else {
        input_2[i] = (input_2[i] < 0) ? (input_2[i] + x_dims[*input_1]) : input_2[i];
        index_dict.insert(std::pair<int32_t, bool>(input_2[i], true));
      }
    }
  }

  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("The number of available CPU cores must be greater than 0!");
      return KERNEL_STATUS_PARAM_INVALID;  // guard the division below
    }

    auto sharder_index_fill = [&](int64_t start, int64_t end) { SpecialCompute<T>(start, end, input_1, index_dict); };

    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_index_fill),
                        "IndexFill Compute failed.");
  } else {
    SpecialCompute<T>(0, data_num, input_1, index_dict);
  }
  return KERNEL_STATUS_OK;
}

uint32_t IndexFillCpuKernel::Compute(CpuKernelContext &ctx) {
  uint32_t res = GetInputAndCheck(ctx);
  if (res != KERNEL_STATUS_OK) {
    return res;
  }

  DataType input_type{ctx.Input(0)->GetDataType()};
  switch (input_type) {
    INDEXFILL_COMPUTE_CASE(DT_INT8, int8_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_INT16, int16_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
    INDEXFILL_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    INDEXFILL_COMPUTE_CASE(DT_FLOAT, float, ctx)
    INDEXFILL_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kIndexFill, IndexFillCpuKernel);
}  // namespace aicpu
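
Note on the kernel above: the heart of SpecialCompute is recovering the coordinate along `dim` from a flat row-major index; with remain_dims equal to the product of the dimensions after `dim`, that coordinate is (i / remain_dims) % x_dims[dim], and the element is overwritten whenever the coordinate appears in index_dict. A standalone check of that arithmetic, assuming an illustrative shape [2, 3, 4], dim = 1 and fill indices {0, 2}:

#include <cstdio>
#include <set>

int main() {
  const int dims[3] = {2, 3, 4};
  const int dim = 1;
  const int remain_dims = dims[2];          // product of the dimensions after dim -> 4
  const std::set<int> fill_index = {0, 2};  // coordinates along dim 1 to fill
  int filled = 0;
  for (int i = 0; i < 2 * 3 * 4; ++i) {
    const int coord = (i / remain_dims) % dims[dim];  // coordinate of element i along dim 1
    if (fill_index.count(coord) != 0) {
      ++filled;
    }
  }
  std::printf("filled %d of 24 elements\n", filled);  // 16: two of the three rows in each batch
  return 0;
}
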
@@ -0,0 +1,40 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_
#define AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_

#include <map>  // for the std::map parameter of SpecialCompute
#include <vector>

#include "cpu_ops_kernel.h"

namespace aicpu {
class IndexFillCpuKernel : public CpuKernel {
 public:
  ~IndexFillCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  uint32_t DoCompute(CpuKernelContext &ctx);
  uint32_t GetInputAndCheck(CpuKernelContext &ctx);
  template <typename T>
  void SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim, std::map<int32_t, bool> &index_dict);

  std::vector<Tensor *> inputs_;
  std::vector<Tensor *> outputs_;
};
}  // namespace aicpu
#endif
@@ -0,0 +1,185 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "kldiv.h"

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"

namespace {
const std::uint32_t kKLDivInputNum{2};
const std::uint32_t kKLDivOutputNum{1};
const std::int64_t ParallelNum{4096};
const char *kKLDiv{"KLDiv"};
}  // namespace

namespace aicpu {
namespace detail {
template <typename T>
inline std::uint32_t ComputeKLDivKernel(const CpuKernelContext &ctx) {
  const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
  auto input = static_cast<T *>(ctx.Input(0)->GetData());
  auto target = static_cast<T *>(ctx.Input(1)->GetData());
  auto output = static_cast<T *>(ctx.Output(0)->GetData());
  std::int64_t total = ctx.Input(0)->NumElements();
  std::size_t data_size = ctx.Input(0)->GetDataSize();
  uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
  std::string reduction = ctx.GetAttr("reduction")->GetString();
  if (reduction != "sum" && reduction != "batchmean" && reduction != "none" && reduction != "mean") {
    KERNEL_LOG_ERROR("%s is not a valid value for reduction", reduction.c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  bool parallel_flag = false;
  if (data_size > ParallelNum * sizeof(T)) {
    parallel_flag = true;
  }
  if (cores == 0) {
    return KERNEL_STATUS_INNER_ERROR;
  }
  T *tmp_array = nullptr;
  if (reduction == "none") {
    tmp_array = output;
  } else {
    tmp_array = new T[total];
  }
  if (parallel_flag) {
    std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
    ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
      std::int64_t length = end - begin;
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input + begin, length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target + begin, length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_reduce(tmp_array + begin, length, 1);
      T constant_zero{0};
      array_reduce = array_target * (Eigen::log(array_target) - array_input);
      for (std::int64_t idx = 0; idx < length; ++idx) {
        if (!(target[begin + idx] > constant_zero)) {
          array_reduce(idx) = constant_zero;
        }
      }
    });
  } else {
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input, total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target, total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_reduce(tmp_array, total, 1);
    array_reduce = array_target * (Eigen::log(array_target) - array_input);
    T constant_zero{0};
    for (uint32_t idx = 0; idx < total; ++idx) {
      if (!(target[idx] > constant_zero)) {
        array_reduce(idx) = constant_zero;
      }
    }
  }
  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > reduce(tmp_array, total, 1);
  if (reduction == "sum") {
    output[0] = reduce.sum();
  } else if (reduction == "batchmean") {
    std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
    output[0] = reduce.sum() / T(input_dims[0]);
  } else if (reduction == "mean") {
    output[0] = reduce.mean();
  }
  if (reduction != "none") {
    delete[] tmp_array;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
inline std::uint32_t ComputeKLDiv(const CpuKernelContext &ctx) {
  uint32_t result = ComputeKLDivKernel<T>(ctx);
  if (result != 0) {
    KERNEL_LOG_ERROR("KLDiv compute failed.");
  }
  return result;
}

inline std::uint32_t KLDivExtraCheck(const CpuKernelContext &ctx) {
  if (ctx.Input(0)->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Get input x data failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(1)->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Get input target data failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Output(0)->GetData() == nullptr) {
    KERNEL_LOG_ERROR("Get output y data failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
    KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
                     DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(0)->GetDataSize() != ctx.Input(1)->GetDataSize()) {
    KERNEL_LOG_ERROR(
      "The data size of the input [%llu] need be the same as the target "
      "[%llu].",
      ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> target_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
  if (input_dims.size() != target_dims.size()) {
    KERNEL_LOG_ERROR(
      "The data dim size of the input x [%llu] need be the same as the "
      "target "
      "[%llu].",
      input_dims.size(), target_dims.size());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  for (size_t index = 0; index < input_dims.size(); index++) {
    if (input_dims[index] != target_dims[index]) {
      KERNEL_LOG_ERROR("The data dim of the input x need be the same as the target.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  return KERNEL_STATUS_OK;
}

std::uint32_t KLDivCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
  return NormalCheck(ctx, kKLDivInputNum, kKLDivOutputNum, {"reduction"}) ? KERNEL_STATUS_PARAM_INVALID
                                                                          : KLDivExtraCheck(ctx);
}
// DT_FLOAT16, DT_FLOAT, DT_DOUBLE
std::uint32_t KLDivCompute(const CpuKernelContext &ctx) {
  DataType input_type{ctx.Input(0)->GetDataType()};
  switch (input_type) {
    case DT_FLOAT16:
      return ComputeKLDiv<Eigen::half>(ctx);
    case DT_FLOAT:
      return ComputeKLDiv<std::float_t>(ctx);
    case DT_DOUBLE:
      return ComputeKLDiv<std::double_t>(ctx);
    default:
      KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}
}  // namespace detail

std::uint32_t KLDivCpuKernel::Compute(CpuKernelContext &ctx) {
  return detail::KLDivCheck(ctx, kKLDivInputNum, kKLDivOutputNum) ? KERNEL_STATUS_PARAM_INVALID
                                                                  : detail::KLDivCompute(ctx);
}

REGISTER_CPU_KERNEL(kKLDiv, KLDivCpuKernel);
}  // namespace aicpu
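
Note on the kernel above: the pointwise term follows the convention in which `input` is already in log space. In LaTeX,

\ell_i = t_i \left( \log t_i - x_i \right), \qquad \ell_i := 0 \ \text{whenever}\ t_i \le 0,

and the reduction attribute then maps to

\text{none} \rightarrow \ell, \quad \text{sum} \rightarrow \sum_i \ell_i, \quad \text{mean} \rightarrow \tfrac{1}{N} \sum_i \ell_i, \quad \text{batchmean} \rightarrow \tfrac{1}{B} \sum_i \ell_i,

where N is the total element count and B is the leading (batch) dimension of x. This matches the code path that zeroes the term where target is non-positive before reducing.
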
@@ -0,0 +1,27 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_KLDIV_H_
#define AICPU_KERNELS_NORMALIZED_KLDIV_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class KLDivCpuKernel final : public CpuKernel {
  virtual std::uint32_t Compute(CpuKernelContext &ctx) override final;
};
}  // namespace aicpu
#endif
@@ -0,0 +1,226 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "kldivlossgrad.h"

#include <complex>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const char *kKlDivLossGrad = "KlDivLossGrad";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const uint32_t kGradIndex = 0;
const uint32_t kInputIndex = 1;
const uint32_t kTargetIndex = 2;
const std::string AttrReduction = "reduction";
const std::string AttrLog = "log_target";
const int64_t DataDefaultParallelNum = 16384;
}  // namespace

namespace aicpu {
template <typename T>
void KlDivLossGradOp(Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &target,
                     Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &grad,
                     Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > &output, std::int64_t &len, bool &log_target,
                     std::string &reduction) {
  T constant_zero{0};
  if (log_target) {
    output = -Eigen::exp(target) * grad;
    return;
  }
  if (reduction == "none") {
    for (uint32_t idx = 0; idx < len; ++idx) {
      if (target(idx) > constant_zero) {
        output(idx) = -target(idx) * grad(idx);
      }
    }
  } else {
    for (uint32_t idx = 0; idx < len; ++idx) {
      if (target(idx) > constant_zero) {
        output(idx) = -target(idx) * grad(0);
      }
    }
  }
  return;
}

std::uint32_t KlDivLossGradExtraCheck(CpuKernelContext &ctx) {
  Tensor *grad = ctx.Input(0);
  Tensor *input = ctx.Input(1);
  Tensor *target = ctx.Input(2);
  Tensor *output = ctx.Output(0);
  if (grad->GetDataSize() == 0) {
    KERNEL_LOG_ERROR("[%s] grad is empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (input->GetDataSize() == 0) {
    KERNEL_LOG_ERROR("[%s] input is empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (target->GetDataSize() == 0) {
    KERNEL_LOG_ERROR("[%s] target is empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (output->GetDataSize() == 0) {
    KERNEL_LOG_ERROR("[%s] output is empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if ((input->GetDataType() != grad->GetDataType()) || (target->GetDataType() != grad->GetDataType()) ||
      (output->GetDataType() != grad->GetDataType())) {
    KERNEL_LOG_ERROR(
      "The data type of the grad [%s], input [%s], target [%s], output y "
      "[%s] must be the same type.",
      DTypeStr(grad->GetDataType()).c_str(), DTypeStr(input->GetDataType()).c_str(),
      DTypeStr(target->GetDataType()).c_str(), DTypeStr(output->GetDataType()).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  std::vector<int64_t> grad_dims = ctx.Input(kGradIndex)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> input_dims = ctx.Input(kInputIndex)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> target_dims = ctx.Input(kTargetIndex)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
  std::string reduction = ctx.GetAttr(AttrReduction)->GetString();
  if (output_dims != input_dims) {
    KERNEL_LOG_ERROR(
      "The data shape of the output need be the same as the input. output "
      "shape [%s], input shape [%s]",
      VectorToString(output_dims).c_str(), VectorToString(input_dims).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (target_dims != input_dims) {
    KERNEL_LOG_ERROR(
      "The data shape of the target need be the same as the input. target "
      "shape [%s], input shape [%s]",
      VectorToString(target_dims).c_str(), VectorToString(input_dims).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (reduction == "mean" || reduction == "sum" || reduction == "batchmean") {
    if (ctx.Input(0)->NumElements() != 1) {
      KERNEL_LOG_ERROR("The data num of the grad [%llu] must be 1", ctx.Input(0)->NumElements());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  } else if (reduction == "none") {
    if (input_dims != grad_dims) {
      KERNEL_LOG_ERROR(
        "The data shape of the grad need be the same as the input. grad "
        "shape "
        "[%s], input shape [%s]",
        VectorToString(grad_dims).c_str(), VectorToString(input_dims).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  return KERNEL_STATUS_OK;
}

uint32_t KlDivLossGradCpuKernel::Compute(CpuKernelContext &ctx) {
  if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (KlDivLossGradExtraCheck(ctx) == KERNEL_STATUS_PARAM_INVALID) {
    return KERNEL_STATUS_PARAM_INVALID;
  }
  // choose compute function depend on dataType
  auto data_type = static_cast<DataType>(ctx.Input(kFirstInputIndex)->GetDataType());
  switch (data_type) {
    case DT_FLOAT16:
      return KlDivLossGradCompute<Eigen::half>(ctx);
    case DT_FLOAT:
      return KlDivLossGradCompute<float>(ctx);
    case DT_DOUBLE:
      return KlDivLossGradCompute<double>(ctx);
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

template <typename T>
uint32_t KlDivLossGradCpuKernel::KlDivLossGradCompute(CpuKernelContext &ctx) {
  int64_t grad_total = ctx.Input(0)->NumElements();
  int64_t input_total = ctx.Input(1)->NumElements();
  int64_t target_total = ctx.Input(2)->NumElements();
  int64_t output_y_total = ctx.Output(0)->NumElements();
  int64_t total = input_total;
  uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
  T *grad = (T *)(ctx.Input(0)->GetData());
  T *input = (T *)(ctx.Input(1)->GetData());
  T *target = (T *)(ctx.Input(2)->GetData());
  T *output = (T *)(ctx.Output(0)->GetData());
  bool parallel_flag = false;
  uint64_t data_size = ctx.Input(1)->GetDataSize();
  // Determine whether to enable multi-core parallel computing
  if (data_size > DataDefaultParallelNum * sizeof(T)) {
    parallel_flag = true;
  }
  // Eigen::Array
  bool log_target{false};
  if (ctx.GetAttr(AttrLog) != nullptr) {
    log_target = ctx.GetAttr(AttrLog)->GetBool();
  }
  std::string reduction{"mean"};
  if (ctx.GetAttr(AttrReduction) != nullptr) {
    reduction = ctx.GetAttr(AttrReduction)->GetString();
  }
  if (cores == 0) {
    KERNEL_LOG_ERROR("KlDivLossGrad compute failed.");
    return KERNEL_STATUS_INNER_ERROR;
  }
  if (parallel_flag) {
    const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
    std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
    auto shard_kldivlossgrad = [&](std::int64_t begin, std::int64_t end) {
      std::int64_t length = end - begin;
      std::int64_t grad_begin{0}, grad_length{grad_total};
      if (reduction == "none") {
        grad_begin = begin;
        grad_length = length;
      }
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_grad(grad + grad_begin, grad_length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input + begin, length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target + begin, length, 1);
      Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_output(output + begin, length, 1);
      T constant_zero{0};
      array_output = constant_zero;
      KlDivLossGradOp<T>(array_target, array_grad, array_output, length, log_target, reduction);
      if (reduction == "mean") {
        array_output = array_output / T(output_y_total);
      } else if (reduction == "batchmean") {
        std::vector<int64_t> input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
        array_output = array_output / T(input_dims[0]);
      }
    };
    KERNEL_HANDLE_ERROR(ParallelFor(ctx, total, per_unit_size, shard_kldivlossgrad), "KlDivLossGrad Compute failed.");
  } else {
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_grad(grad, grad_total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_input(input, input_total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_target(target, target_total, 1);
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > array_output(output, output_y_total, 1);
    T constant_zero{0};
    array_output = constant_zero;
    KlDivLossGradOp<T>(array_target, array_grad, array_output, output_y_total, log_target, reduction);
    if (reduction == "mean") {
      array_output = array_output / T(output_y_total);
    } else if (reduction == "batchmean") {
      std::vector<int64_t> input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
      array_output = array_output / T(input_dims[0]);
    }
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kKlDivLossGrad, KlDivLossGradCpuKernel);
}  // namespace aicpu
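
Note on the kernel above: differentiating the forward term l_i = t_i (log t_i - x_i) with respect to the log-space input x gives the expressions the kernel applies,

\frac{\partial \ell_i}{\partial x_i} = -t_i \, g \qquad \text{or} \qquad -e^{t_i} \, g \quad (\texttt{log\_target} = \text{true}),

where g is grad(i) for reduction "none" and the single incoming scalar otherwise. In the non-log-target path the term is suppressed wherever t_i <= 0, and the result is further divided by the element count N for "mean" or by the leading batch dimension for "batchmean", mirroring the forward reduction.
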
@@ -0,0 +1,42 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
#define AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"

namespace aicpu {
class KlDivLossGradCpuKernel : public CpuKernel {
 public:
  KlDivLossGradCpuKernel() = default;
  ~KlDivLossGradCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  /**
   * @brief compute for all types
   * @param ctx cpu kernel context
   * @return status if success
   */
  template <typename T>
  uint32_t KlDivLossGradCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H
@@ -0,0 +1,173 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lcm.h"

#include <cmath>
#include <set>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kLcmOutputNum = 1;
const uint32_t kLcmInputNum = 2;
const char *kLcm = "Lcm";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int32_t kInput_32_32 = 3;
const int32_t kInput_32_64 = 2;
const int32_t kInput_64_32 = 1;
const int32_t kInput_64_64 = 0;
}  // namespace

namespace aicpu {
// Simple recursive gcd.
template <class T>
T elewise_gcd(T a, T b) {
  if (b == 0) {
    return a;
  }
  return elewise_gcd(b, a % b);
}
// Simple lcm.
template <typename T>
T elewise_lcm(T a, T b) {
  T gcd_tmp = elewise_gcd<T>(a, b);
  if (gcd_tmp == 0) {
    return static_cast<T>(0);
  }
  return std::abs(a / gcd_tmp * b);
}

uint32_t LcmIOTypeCheck(CpuKernelContext &ctx, int32_t &dual_types) {
  Tensor *x1 = ctx.Input(kFirstInputIndex);
  Tensor *x2 = ctx.Input(kSecondInputIndex);
  Tensor *y = ctx.Output(kFirstOutputIndex);
  const std::set<DataType> supported_types{DT_INT32, DT_INT64};
  auto x1_type = x1->GetDataType();
  auto x2_type = x2->GetDataType();
  auto y_type = y->GetDataType();
  KERNEL_CHECK_FALSE(supported_types.count(x1_type) != 0, KERNEL_STATUS_PARAM_INVALID,
                     "[Lcm] input x1 data type [%s] is not supported.", DTypeStr(x1_type).c_str());
  KERNEL_CHECK_FALSE(supported_types.count(x2_type) != 0, KERNEL_STATUS_PARAM_INVALID,
                     "[Lcm] input x2 data type [%s] is not supported.", DTypeStr(x2_type).c_str());
  int32_t x1_is_i32 = static_cast<int32_t>(x1_type == DT_INT32) << 1;
  int32_t x2_is_i32 = static_cast<int32_t>(x2_type == DT_INT32);
  int32_t _dual_types = x1_is_i32 | x2_is_i32;
  switch (_dual_types) {
    case kInput_64_64:
    case kInput_64_32:
    case kInput_32_64:
      KERNEL_CHECK_FALSE(y_type == DT_INT64, KERNEL_STATUS_PARAM_INVALID,
                         "[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
      dual_types = _dual_types;
      break;
    case kInput_32_32:
      KERNEL_CHECK_FALSE(y_type == DT_INT32, KERNEL_STATUS_PARAM_INVALID,
                         "[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
      dual_types = _dual_types;
      break;
    default:
      KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported.");
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <class T1, class T2, class T3>
uint32_t LcmElewiseCompute(CpuKernelContext &ctx, const T1 *x1_ptr, const T2 *x2_ptr, T3 *y_ptr, Bcast &bcast) {
  int64_t data_num = ctx.Output(kFirstOutputIndex)->NumElements();
  auto lcm_shard = [&](int64_t start, int64_t end) {
    for (int64_t i = start; i < end; ++i) {
      T3 x1_ele_abs = std::abs(static_cast<T3>(x1_ptr[bcast.GetBroadcastXIndex(i)]));
      T3 x2_ele_abs = std::abs(static_cast<T3>(x2_ptr[bcast.GetBroadcastYIndex(i)]));
      y_ptr[i] = elewise_lcm(x1_ele_abs, x2_ele_abs);
    }
  };
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("[Lcm] max_core_num is 0, please check the cpu num.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
    uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, lcm_shard);
    if (ret != KERNEL_STATUS_OK) {
      KERNEL_LOG_ERROR("[Lcm] Lcm Compute failed.");
      return ret;
    }
  } else {
    lcm_shard(0, data_num);
  }

  return KERNEL_STATUS_OK;
}

template <class T1, class T2, class T3>
uint32_t LcmCompute(CpuKernelContext &ctx) {
  Tensor *x1 = ctx.Input(kFirstInputIndex);
  Tensor *x2 = ctx.Input(kSecondInputIndex);
  Tensor *y = ctx.Output(kFirstOutputIndex);
  const T1 *x1_ptr = reinterpret_cast<const T1 *>(x1->GetData());
  const T2 *x2_ptr = reinterpret_cast<const T2 *>(x2->GetData());
  T3 *y_ptr = reinterpret_cast<T3 *>(y->GetData());
  auto x1_shape = x1->GetTensorShape()->GetDimSizes();
  auto x2_shape = x2->GetTensorShape()->GetDimSizes();
  Bcast bcast(x1_shape, x2_shape);
  if (bcast.IsValid()) {
    return LcmElewiseCompute<T1, T2, T3>(ctx, x1_ptr, x2_ptr, y_ptr, bcast);
  } else {
    KERNEL_LOG_ERROR("[Lcm] broadcast failed.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t LcmCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kLcmInputNum, kLcmOutputNum), "[Lcm] check input and output number failed.");
  int32_t dual_types = static_cast<int32_t>(-1);
  KERNEL_HANDLE_ERROR(LcmIOTypeCheck(ctx, dual_types), "[Lcm] check data type failed.");
  switch (dual_types) {
    case kInput_64_64:
      return LcmCompute<int64_t, int64_t, int64_t>(ctx);
      break;
    case kInput_64_32:
      return LcmCompute<int64_t, int32_t, int64_t>(ctx);
      break;
    case kInput_32_64:
      return LcmCompute<int32_t, int64_t, int64_t>(ctx);
      break;
    case kInput_32_32:
      return LcmCompute<int32_t, int32_t, int32_t>(ctx);
      break;
    default:
      KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported.");
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kLcm, LcmCpuKernel);
}  // namespace aicpu
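
Note on the kernel above: the least common multiple is formed as |a / gcd(a, b) * b|, dividing before multiplying to keep the intermediate product small, and lcm is defined as 0 whenever the gcd is 0 (both operands are 0, since the kernel feeds in absolute values). A standalone check of the two helpers, instantiated for 64-bit integers with illustrative free-function names:

#include <cstdio>
#include <cstdlib>

// Same recursion as elewise_gcd / elewise_lcm above, written as plain functions.
long long Gcd(long long a, long long b) { return b == 0 ? a : Gcd(b, a % b); }
long long Lcm(long long a, long long b) {
  const long long g = Gcd(a, b);
  return g == 0 ? 0 : std::llabs(a / g * b);
}

int main() {
  std::printf("%lld %lld %lld\n", Lcm(4, 6), Lcm(21, 6), Lcm(5, 0));  // 12 42 0
  return 0;
}
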
@@ -0,0 +1,32 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_LCM_H_
#define AICPU_KERNELS_NORMALIZED_LCM_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class LcmCpuKernel : public CpuKernel {
 public:
  ~LcmCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;
};
}  // namespace aicpu
#endif
|
|
@ -0,0 +1,126 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "logit.h"
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "Eigen/Dense"
|
||||
#include "Eigen/LU"
|
||||
#include "cmath"
|
||||
#include "cpu_context.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 16 * 1024;
|
||||
const char *kLogit = "Logit";
|
||||
|
||||
#define LOGIT_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = LogitCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Logit kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t LogitCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogit);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
LOGIT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
LOGIT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
LOGIT_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Logit kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t LogitCpuKernel::LogitCompute(CpuKernelContext &ctx) {
|
||||
auto input_tensor = ctx.Input(0);
|
||||
auto output_tensor = ctx.Output(0);
|
||||
auto input = reinterpret_cast<T *>(input_tensor->GetData());
|
||||
auto output = reinterpret_cast<T *>(output_tensor->GetData());
|
||||
AttrValue *attr = ctx.GetAttr("eps");
|
||||
float eps = -1.0;
|
||||
if (attr != nullptr) {
|
||||
eps = attr->GetFloat();
|
||||
}
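// Logit computes log(x / (1 - x)). When the "eps" attribute is negative (the default),
// inputs are used as-is; otherwise each input is clamped to [eps, 1 - eps] before the transform.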
|
||||
auto input_shape = input_tensor->GetTensorShape();
|
||||
int64_t data_num = input_shape->NumElements();
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U);
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_less = [&](size_t start, size_t end) {
|
||||
T one = T(1);
|
||||
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
|
||||
if (eps < 0) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T x = input[i];
|
||||
output[i] = log(x / (one - x));
|
||||
}
|
||||
} else {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T z;
|
||||
T x = input[i];
|
||||
z = x < static_cast<T>(eps) ? static_cast<T>(eps) : (x > up_bound ? up_bound : x);
|
||||
output[i] = log(z / (one - z));
|
||||
}
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max core num is 0");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less),
|
||||
"Logit Compute failed.");
|
||||
} else {
|
||||
T one = T(1);
|
||||
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
|
||||
if (eps < 0) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T x = input[i];
|
||||
output[i] = log(x / (one - x));
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T z;
|
||||
T x = input[i];
|
||||
z = x < static_cast<T>(eps) ? static_cast<T>(eps) : (x > up_bound ? up_bound : x);
|
||||
output[i] = log(z / (one - z));
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kLogit, LogitCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,36 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_H
|
||||
#define AICPU_KERNELS_NORMALIZED_LOGIT_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class LogitCpuKernel : public CpuKernel {
|
||||
public:
|
||||
LogitCpuKernel() = default;
|
||||
~LogitCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t LogitCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,133 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "logit_grad.h"
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "Eigen/Dense"
|
||||
#include "Eigen/LU"
|
||||
#include "cmath"
|
||||
#include "cpu_context.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 16 * 1024;
|
||||
const char *kLogitGrad = "LogitGrad";
|
||||
|
||||
#define LOGITGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = LogitGradCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("LogitGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t LogitGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogitGrad);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
LOGITGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
LOGITGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
LOGITGRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("LogitGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t LogitGradCpuKernel::LogitGradCompute(CpuKernelContext &ctx) {
|
||||
auto input_y_grad_tensor = ctx.Input(0);
|
||||
auto input_x_tensor = ctx.Input(1);
|
||||
auto output_x_grad_tensor = ctx.Output(0);
|
||||
auto input_y_grad = reinterpret_cast<T *>(input_y_grad_tensor->GetData());
|
||||
auto input_x = reinterpret_cast<T *>(input_x_tensor->GetData());
|
||||
auto output_x_grad = reinterpret_cast<T *>(output_x_grad_tensor->GetData());
|
||||
auto input_shape = input_x_tensor->GetTensorShape();
|
||||
int64_t data_num = input_shape->NumElements();
|
||||
float eps = -1.0;
|
||||
AttrValue *attr = ctx.GetAttr("eps");
|
||||
if (attr != nullptr) {
|
||||
eps = attr->GetFloat();
|
||||
}
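// LogitGrad: dL/dx = dy / (x * (1 - x)). With eps < 0 the gradient is NaN outside [0, 1];
// with eps >= 0 it is zeroed outside the clamped range [eps, 1 - eps].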
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U);
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_less = [&](size_t start, size_t end) {
|
||||
T one = T(1);
|
||||
T zero = T(0);
|
||||
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
|
||||
if (eps < 0) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T y_grad = input_y_grad[i];
|
||||
T x = input_x[i];
|
||||
output_x_grad[i] = (x < zero || x > one) ? std::numeric_limits<T>::quiet_NaN() : (y_grad / (x * (one - x)));
|
||||
}
|
||||
} else {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T y_grad = input_y_grad[i];
|
||||
T x = input_x[i];
|
||||
output_x_grad[i] =
|
||||
static_cast<float>(x) < static_cast<float>(eps) || static_cast<float>(x) > static_cast<float>(up_bound)
|
||||
? zero
|
||||
: (y_grad / (x * (one - x)));
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less),
|
||||
"LogitGrad Compute failed.");
|
||||
} else {
|
||||
T one = T(1);
|
||||
T zero = T(0);
|
||||
T up_bound = static_cast<T>(1) - static_cast<T>(eps);
|
||||
if (eps < 0) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T y_grad = input_y_grad[i];
|
||||
T x = input_x[i];
|
||||
output_x_grad[i] = (x < zero || x > one) ? std::numeric_limits<T>::quiet_NaN() : (y_grad / (x * (one - x)));
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T y_grad = input_y_grad[i];
|
||||
T x = input_x[i];
|
||||
output_x_grad[i] =
|
||||
static_cast<float>(x) < static_cast<float>(eps) || static_cast<float>(x) > static_cast<float>(up_bound)
|
||||
? zero
|
||||
: (y_grad / (x * (one - x)));
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kLogitGrad, LogitGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,36 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H
|
||||
#define AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class LogitGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
LogitGradCpuKernel() = default;
|
||||
~LogitGradCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t LogitGradCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,153 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "lower_bound.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kLowerBound = "LowerBound";
|
||||
|
||||
#define LOWERBOUND_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = LowerBoundCompute<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("LowerBound kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define LOWERBOUND_COMPUTE_CASE_ALL(TYPE, CTX) \
|
||||
LOWERBOUND_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
|
||||
LOWERBOUND_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
|
||||
LOWERBOUND_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
|
||||
LOWERBOUND_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
|
||||
LOWERBOUND_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
|
||||
LOWERBOUND_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
|
||||
LOWERBOUND_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
|
||||
LOWERBOUND_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
|
||||
LOWERBOUND_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t LowerBoundCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LowerBound check input and output number failed.");
|
||||
Tensor *sorted_x_data = ctx.Input(0);
|
||||
Tensor *values_data = ctx.Input(1);
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_type = output_data->GetDataType();
|
||||
auto sorted_x_type = sorted_x_data->GetDataType();
|
||||
auto values_type = values_data->GetDataType();
|
||||
if (sorted_x_type != values_type) {
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] must be same with Input[1] data type[%s]", DTypeStr(sorted_x_type).c_str(),
|
||||
DTypeStr(values_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
switch (output_type) {
|
||||
case DT_INT32:
|
||||
switch (sorted_x_type) {
|
||||
LOWERBOUND_COMPUTE_CASE_ALL(int32_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (sorted_x_type) {
|
||||
LOWERBOUND_COMPUTE_CASE_ALL(int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Output data type[%s] not supported.", DTypeStr(output_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t LowerBoundCpuKernel::LowerBoundCompute(CpuKernelContext &ctx) {
|
||||
Tensor *sorted_x_data = ctx.Input(0);
|
||||
auto sorted_x_data_addr = reinterpret_cast<T1 *>(sorted_x_data->GetData());
|
||||
auto sorted_x_data_shape = sorted_x_data->GetTensorShape();
|
||||
std::vector<int64_t> sorted_x_data_shape_dims = sorted_x_data_shape->GetDimSizes();
|
||||
Tensor *values_data = ctx.Input(1);
|
||||
auto values_data_addr = reinterpret_cast<T1 *>(values_data->GetData());
|
||||
auto values_data_shape = values_data->GetTensorShape();
|
||||
int64_t values_data_num = values_data_shape->NumElements();
|
||||
std::vector<int64_t> values_data_shape_dims = values_data_shape->GetDimSizes();
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_addr = reinterpret_cast<T2 *>(output_data->GetData());
|
||||
if (sorted_x_data_shape_dims[0] != values_data_shape_dims[0]) {
|
||||
KERNEL_LOG_ERROR("The number of rows of Input[0]:([%d]) should be consistent with that of Input[1]:([%d]).",
|
||||
sorted_x_data_shape_dims[0], values_data_shape_dims[0]);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
int64_t sorted_x_data_column = sorted_x_data_shape_dims[1];
|
||||
int64_t values_data_column = values_data_shape_dims[1];
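// For each value, run a binary search over the corresponding row of sorted_x to find the
// first position whose element is not less than the value (the lower bound), and store the
// column offset of that position.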
|
||||
if (values_data_num < 1024) {
|
||||
for (int64_t i = 0; i < values_data_num; i++) {
|
||||
int64_t seq_row = i / values_data_column;
|
||||
int64_t low = seq_row * sorted_x_data_column;
|
||||
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
|
||||
int64_t mid;
|
||||
while (low <= up) {
|
||||
mid = (low + up) / 2;
|
||||
if (values_data_addr[i] <= sorted_x_data_addr[mid]) {
|
||||
up = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
output_data_addr[i] = low - seq_row * sorted_x_data_column;
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (sum_core_num > values_data_num) {
|
||||
sum_core_num = values_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t seq_row = i / values_data_column;
|
||||
int64_t low = seq_row * sorted_x_data_column;
|
||||
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
|
||||
int64_t mid;
|
||||
while (low <= up) {
|
||||
mid = (low + up) / 2;
|
||||
if (values_data_addr[i] <= sorted_x_data_addr[mid]) {
|
||||
up = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
output_data_addr[i] = low - seq_row * sorted_x_data_column;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, values_data_num, values_data_num / sum_core_num, shard_compute),
|
||||
"LowerBound Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kLowerBound, LowerBoundCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class LowerBoundCpuKernel : public CpuKernel {
|
||||
public:
|
||||
LowerBoundCpuKernel() = default;
|
||||
~LowerBoundCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t LowerBoundCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,115 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "lstsq.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include <Eigen/Dense>
|
||||
#include <Eigen/Cholesky>
|
||||
#include <iostream>
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kLstsq = "Lstsq";
|
||||
} // namespace
namespace aicpu {
|
||||
uint32_t LstsqCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Lstsq check input and output number failed.");
|
||||
Tensor *input_x0 = ctx.Input(0);
|
||||
Tensor *input_x1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
auto dims_0 = input_x0->GetTensorShape()->GetDims();
|
||||
auto dims_1 = input_x1->GetTensorShape()->GetDims();
|
||||
KERNEL_CHECK_FALSE((dims_0 == 2), KERNEL_STATUS_PARAM_INVALID, "Dimension of input[0] must be 2, but got [%d].",
|
||||
dims_0);
|
||||
KERNEL_CHECK_FALSE(((dims_1 == 2) || (dims_1 == 1)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension of input[1] must be 2 or 1, but got[%zu].", dims_1);
|
||||
auto shape_0 = input_x0->GetTensorShape();
|
||||
auto shape_1 = input_x1->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((shape_0->GetDimSize(0) == shape_1->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Lstsq shape_0[0] and shape_1[0] not equal.", shape_0->GetDimSize(0), shape_0->GetDimSize(1));
|
||||
AttrValue *I2_regularizer = ctx.GetAttr("l2_regularizer");
|
||||
AttrValue *fast = ctx.GetAttr("fast");
|
||||
KERNEL_CHECK_NULLPTR(I2_regularizer, KERNEL_STATUS_PARAM_INVALID, "Get l2_regularizer failed.");
|
||||
KERNEL_CHECK_NULLPTR(fast, KERNEL_STATUS_PARAM_INVALID, "Get fast failed.");
|
||||
KERNEL_LOG_DEBUG(
|
||||
"LstsqCpuKernel[%s], inputx0: size[%llu];"
|
||||
"inputx1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_x0->GetDataSize(), input_x1->GetDataSize(), output->GetDataSize());
|
||||
DataType data_type1 = ctx.Input(0)->GetDataType();
|
||||
DataType data_type2 = ctx.Input(1)->GetDataType();
|
||||
KERNEL_CHECK_FALSE((data_type1 == data_type2), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Lstsq input_0_dtype must be equal to input_1_dtype.", data_type1, data_type2);
|
||||
switch (data_type1) {
|
||||
case DT_FLOAT16:
|
||||
return LstsqCompute<float, Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return LstsqCompute<float, float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return LstsqCompute<double, double>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Lstsq kernel data type [%u] not support.", DTypeStr(data_type1).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t LstsqCpuKernel::LstsqCompute(CpuKernelContext &ctx) {
|
||||
Eigen::Index m = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
Eigen::Index n = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
|
||||
Eigen::Index k = 1;
|
||||
if (ctx.Input(1)->GetTensorShape()->GetDims() == 2) {
|
||||
k = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
|
||||
}
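// k is the number of right-hand-side columns; a rank-1 b is treated as a single column.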
|
||||
|
||||
typedef Eigen::Matrix<T1, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartixXd;
|
||||
MartixXd A(m, n);
|
||||
MartixXd B(m, k);
|
||||
|
||||
auto aptr = reinterpret_cast<T2 *>(ctx.Input(0)->GetData());
|
||||
auto bptr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
|
||||
|
||||
for (int i = 0; i < m * n; i++) {
|
||||
*(A.data() + i) = static_cast<T1>(*(aptr + i));
|
||||
}
|
||||
for (int i = 0; i < m * k; i++) {
|
||||
*(B.data() + i) = static_cast<T1>(*(bptr + i));
|
||||
}
|
||||
|
||||
MartixXd result(n, k);
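// Overdetermined case (m >= n): solve min ||Ax - B|| with a column-pivoted Householder QR.
// Underdetermined case (m < n): form the minimum-norm solution x = A^T (A A^T)^(-1) B.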
|
||||
if (m >= n) {
|
||||
result = A.colPivHouseholderQr().solve(B);
|
||||
} else {
|
||||
MartixXd A_Transpose = A.transpose();
|
||||
MartixXd temp = A * A_Transpose;
|
||||
MartixXd tempI = temp.inverse();
|
||||
MartixXd x = A_Transpose * tempI;
|
||||
MartixXd output = x * B;
|
||||
result = output;
|
||||
}
|
||||
auto output_addr = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
|
||||
for (int i = 0; i < n * k; i++) {
|
||||
*(output_addr + i) = static_cast<T2>(*(result.data() + i));
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kLstsq, LstsqCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,37 @@
|
|||
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_LSTSQ_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_LSTSQ_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class LstsqCpuKernel : public CpuKernel {
|
||||
public:
|
||||
LstsqCpuKernel() = default;
|
||||
~LstsqCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t LstsqCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,185 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "lu_solve.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include <Eigen/Dense>
|
||||
#include <iostream>
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 3;
|
||||
const int64_t kParallelBatchNum1 = 50;
|
||||
const int64_t kParallelBatchNum4 = 200;
|
||||
const int64_t kParallelBatchNum8 = 500;
|
||||
const int64_t kParallelBatchNumx = 1000;
|
||||
const char *kLuSolve = "LuSolve";
|
||||
} // namespace
|
||||
namespace aicpu {
|
||||
uint32_t LuSolveCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check LuSolve params failed.");
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input0 data failed.");
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
KERNEL_CHECK_NULLPTR(input_1->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input1 data failed.");
|
||||
Tensor *input_2 = ctx.Input(2);
|
||||
KERNEL_CHECK_NULLPTR(input_2->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input2 data failed.");
|
||||
Tensor *output = ctx.Output(0);
|
||||
auto input_0_Shape = input_0->GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(input_0_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_0_Shape failed.")
|
||||
auto input_1_Shape = input_1->GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(input_1_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_1_Shape failed.")
|
||||
auto input_2_Shape = input_2->GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(input_2_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_2_Shape failed.")
|
||||
int32_t b_dims = input_0_Shape->GetDims();
|
||||
int32_t lu_dims = input_1_Shape->GetDims();
|
||||
int32_t pivots_dims = input_2_Shape->GetDims();
|
||||
std::vector<int64_t> b_dims_vector = input_0_Shape->GetDimSizes();
|
||||
std::vector<int64_t> lu_dims_vector = input_1_Shape->GetDimSizes();
|
||||
std::vector<int64_t> pivots_dims_vector = input_2_Shape->GetDimSizes();
|
||||
if (b_dims == lu_dims) {
|
||||
for (int32_t i = 0; i <= b_dims - 2; i++) {
|
||||
if (b_dims_vector[i] != lu_dims_vector[i]) {
|
||||
KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
} else if (lu_dims > b_dims) {
|
||||
for (int32_t i = 0; i < b_dims - 2; i++) {
|
||||
if (b_dims_vector[i] != lu_dims_vector[lu_dims - b_dims + i]) {
|
||||
KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < lu_dims - 2; i++) {
|
||||
if (lu_dims_vector[i] != b_dims_vector[b_dims - lu_dims + i]) {
|
||||
KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int32_t i = 0; i < pivots_dims; i++) {
|
||||
if (lu_dims_vector[i] != pivots_dims_vector[i]) {
|
||||
KERNEL_LOG_ERROR("batch dimension of LU_pivots doesn't match batch dimension of LU_data!");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
KERNEL_LOG_DEBUG(
|
||||
"LuSolveCpuKernel[%s], input_0: size[%llu], input_1: size[%llu], input_2: size[%llu]"
|
||||
"output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), input_2->GetDataSize(),
|
||||
output->GetDataSize());
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return LuSolveCompute<float, float>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return LuSolveCompute<float, Eigen::half>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("LuSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename T2>
|
||||
uint32_t LuSolveCpuKernel::LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr,
|
||||
int32_t *pivots_working_ptr, int64_t b_stride, int64_t a) {
|
||||
auto output_y = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
|
||||
auto input_0_Shape = ctx.Input(0)->GetTensorShape();
|
||||
auto input_1_Shape = ctx.Input(1)->GetTensorShape();
|
||||
int32_t lu_dims = input_1_Shape->GetDims();
|
||||
int64_t lu_maxtrix_sizes = input_1_Shape->GetDimSize(lu_dims - 2);
|
||||
int32_t b_dim = input_0_Shape->GetDims();
|
||||
int64_t b_m = input_0_Shape->GetDimSize(b_dim - 1);
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
|
||||
MatrixXd matrix_b = Eigen::Map<MatrixXd>(b_working_ptr, lu_maxtrix_sizes, b_m);
|
||||
MatrixXd matrix_A = Eigen::Map<MatrixXd>(lu_working_ptr, lu_maxtrix_sizes, lu_maxtrix_sizes);
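// Apply the row permutation recorded in LU_pivots to b (pivots are 1-based), rebuild the
// unit-lower and upper triangular factors from the packed LU matrix, and solve (L * U) x = P b.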
|
||||
for (int64_t i = 0; i < input_0_Shape->GetDimSize(b_dim - 2); i++) {
|
||||
matrix_b.row(i).swap(matrix_b.row(*(pivots_working_ptr + i) - 1));
|
||||
}
|
||||
MatrixXd L = matrix_A.template triangularView<Eigen::UnitLower>();
|
||||
MatrixXd U = matrix_A.template triangularView<Eigen::Upper>();
|
||||
MatrixXd result = (L * U).lu().solve(matrix_b);
|
||||
for (int64_t m = 0; m < b_stride; m++) {
|
||||
*(output_y + a * b_stride + m) = (T2) * (result.data() + m);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename T2>
|
||||
uint32_t LuSolveCpuKernel::LuSolveCompute(CpuKernelContext &ctx) {
|
||||
auto input_x0 = reinterpret_cast<T2 *>(ctx.Input(0)->GetData());
|
||||
auto input_x1 = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
|
||||
auto input_x2 = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
|
||||
auto input_0_Shape = ctx.Input(0)->GetTensorShape();
|
||||
auto input_1_Shape = ctx.Input(1)->GetTensorShape();
|
||||
auto input_2_Shape = ctx.Input(2)->GetTensorShape();
|
||||
T *input_0 = new T[input_0_Shape->NumElements()];
|
||||
T *input_1 = new T[input_1_Shape->NumElements()];
|
||||
for (int64_t i = 0; i < input_0_Shape->NumElements(); i++) {
|
||||
*(input_0 + i) = (T) * (input_x0 + i);
|
||||
}
|
||||
for (int64_t i = 0; i < input_1_Shape->NumElements(); i++) {
|
||||
*(input_1 + i) = (T) * (input_x1 + i);
|
||||
}
|
||||
int32_t b_dims = input_0_Shape->GetDims();
|
||||
int32_t lu_dims = input_1_Shape->GetDims();
|
||||
std::vector<int64_t> b_dims_vector = input_0_Shape->GetDimSizes();
|
||||
std::vector<int64_t> lu_dims_vector = input_1_Shape->GetDimSizes();
|
||||
std::vector<int64_t> pivots_dims_vector = input_2_Shape->GetDimSizes();
|
||||
int64_t b_stride = input_0_Shape->GetDimSize(b_dims - 1) * input_0_Shape->GetDimSize(b_dims - 2);
|
||||
int64_t lu_stride = input_1_Shape->GetDimSize(lu_dims - 1) * input_1_Shape->GetDimSize(lu_dims - 2);
|
||||
int64_t pivots_stride = input_1_Shape->GetDimSize(lu_dims - 1);
|
||||
std::vector<int64_t> b_shape = b_dims_vector;
|
||||
std::vector<int64_t> lu_shape = lu_dims_vector;
|
||||
for (size_t i = 0; i < 2; i++) {
|
||||
b_shape.pop_back();
|
||||
lu_shape.pop_back();
|
||||
}
|
||||
Bcast bcast(b_shape, lu_shape);
|
||||
int64_t batch_num = ctx.Output(0)->NumElements() / b_stride;
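// Broadcast the batch dimensions of b and the LU factors, then solve each batched system;
// small batches run serially, larger ones are sharded across CPU cores.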
|
||||
if (batch_num < kParallelBatchNum1) {
|
||||
for (int64_t i = 0; i < batch_num; i++) {
|
||||
T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
|
||||
T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
|
||||
int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
|
||||
LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (batch_num < kParallelBatchNumx) max_core_num = 8U;
|
||||
if (batch_num < kParallelBatchNum8) max_core_num = 4U;
|
||||
if (batch_num < kParallelBatchNum4) max_core_num = 2U;
|
||||
auto sharder = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; i++) {
|
||||
T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
|
||||
T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
|
||||
int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
|
||||
LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder),
|
||||
"LuSolve Compute failed.");
|
||||
}
|
||||
delete[] input_0;
delete[] input_1;
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kLuSolve, LuSolveCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,22 @@
|
|||
#ifndef AICPU_KERNELS_NORMALIZED_LUSOLVE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_LUSOLVE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
namespace aicpu {
|
||||
|
||||
class LuSolveCpuKernel : public CpuKernel {
|
||||
public:
|
||||
LuSolveCpuKernel() = default;
|
||||
~LuSolveCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T, typename T2>
|
||||
static uint32_t LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr, int32_t *pivots_working_ptr,
|
||||
int64_t b_stride, int64_t i);
|
||||
template <typename T, typename T2>
|
||||
static uint32_t LuSolveCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,321 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "lu_unpack.h"
|
||||
#include <string.h>
|
||||
#include <Eigen/Dense>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include "cpu_context.h"
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_tensor.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 3;
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kFirstInputIndex = 0;
|
||||
const uint32_t kSecondInputIndex = 1;
|
||||
const uint32_t kFirstOutputIndex = 0;
|
||||
const uint32_t kSecondOutputIndex = 1;
|
||||
const uint32_t kThirdOutputIndex = 2;
|
||||
const int32_t kLuDataMinRank = 2;
|
||||
const int32_t kLuPivotsMinRank = 2;
|
||||
const int64_t kParallelBatchNum = 70;
|
||||
const char *kLuUnpack = "LuUnpack";
|
||||
} // namespace
|
||||
namespace aicpu {
|
||||
template <typename T_data, typename T_pivots>
|
||||
uint32_t LuUnpackCpuKernel::LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index,
|
||||
T_data *P_eye) {
|
||||
int32_t Lu_data_dims = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDims();
|
||||
int64_t Lu_data_dim1 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 2);
|
||||
int64_t Lu_data_dim2 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 1);
|
||||
int32_t Lu_pivots_dims = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDims();
|
||||
int64_t Lu_pivots_dim = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDimSize(Lu_pivots_dims - 1);
|
||||
int64_t matrix_width = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 2];
|
||||
int64_t matrix_height = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 1];
|
||||
int64_t pivots_stride = Lu_data_dim1 * Lu_data_dim1;
|
||||
int64_t L_stride = 0;
|
||||
int64_t U_stride = 0;
|
||||
if (Lu_data_dim1 > Lu_data_dim2) {
|
||||
L_stride = Lu_data_dim1 * Lu_data_dim2;
|
||||
U_stride = Lu_data_dim2 * Lu_data_dim2;
|
||||
} else {
|
||||
L_stride = Lu_data_dim1 * Lu_data_dim1;
|
||||
U_stride = Lu_data_dim1 * Lu_data_dim2;
|
||||
}
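// L is returned as (dim1 x min(dim1, dim2)) and U as (min(dim1, dim2) x dim2), so the strides
// above depend on whether the packed LU matrix is tall or wide.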
|
||||
int64_t matrix_size = matrix_width * matrix_height;
|
||||
using MatrixMap = Eigen::Map<Eigen::Matrix<T_data, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
|
||||
MatrixMap input(reinterpret_cast<T_data *>(ctx.Input(kFirstInputIndex)->GetData()) + matrix_index * matrix_size,
|
||||
matrix_width, matrix_height);
|
||||
// Triu
|
||||
if (matrix_width > matrix_height) {
|
||||
MatrixMap output2(reinterpret_cast<T_data *>(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride,
|
||||
matrix_height, matrix_height);
|
||||
T_data *MiddlePtr = new T_data[matrix_size];
|
||||
MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height);
|
||||
MiddleData = input.template triangularView<Eigen::Upper>();
|
||||
output2 = MiddleData.block(0, 0, matrix_height, matrix_height);
|
||||
delete[] MiddlePtr;
|
||||
} else {
|
||||
MatrixMap output2(reinterpret_cast<T_data *>(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride,
|
||||
matrix_width, matrix_height);
|
||||
output2 = input.template triangularView<Eigen::Upper>();
|
||||
}
|
||||
// Tril
|
||||
if (matrix_height > matrix_width) {
|
||||
MatrixMap output1(reinterpret_cast<T_data *>(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride,
|
||||
matrix_width, matrix_width);
|
||||
T_data *MiddlePtr = new T_data[matrix_size];
|
||||
MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height);
|
||||
MiddleData = input.template triangularView<Eigen::UnitLower>();
|
||||
output1 = MiddleData.block(0, 0, matrix_width, matrix_width);
|
||||
delete[] MiddlePtr;
|
||||
} else {
|
||||
MatrixMap output1(reinterpret_cast<T_data *>(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride,
|
||||
matrix_width, matrix_height);
|
||||
output1 = input.template triangularView<Eigen::UnitLower>();
|
||||
}
|
||||
// Swap
|
||||
std::vector<T_pivots> final_order;
|
||||
final_order.resize(Lu_data_dim1);
|
||||
for (int i = 0; i < Lu_data_dim1; i++) {
|
||||
final_order[i] = T_pivots(i);
|
||||
}
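// LU_pivots stores 1-based row swaps. Replaying the swaps on an identity permutation
// reconstructs the row order used to assemble the permutation matrix P below.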
|
||||
for (T_pivots id = 0; id < Lu_pivots_dim; id++) {
|
||||
int64_t perm_id = 0;
|
||||
int64_t perm_pivots_id = 0;
|
||||
for (int64_t i = 0; i < Lu_data_dim1; i++) {
|
||||
if (id == final_order[i]) {
|
||||
perm_id = i;
|
||||
}
|
||||
if (!((*(Lu_pivots_working_ptr + id) <= Lu_data_dim1) && (*(Lu_pivots_working_ptr + id) >= 1))) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if ((*(Lu_pivots_working_ptr + id) - 1) == final_order[i]) {
|
||||
perm_pivots_id = i;
|
||||
}
|
||||
}
|
||||
std::swap(final_order[perm_id], final_order[perm_pivots_id]);
|
||||
}
|
||||
// Index_select
|
||||
auto output_y0 = reinterpret_cast<T_data *>(ctx.Output(kFirstOutputIndex)->GetData());
|
||||
int64_t indices_num = final_order.size();
|
||||
int64_t inner_size = Lu_data_dim1;
|
||||
int64_t slice_size = inner_size * sizeof(T_data);
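// Gather rows of the identity matrix P_eye according to final_order to materialize P for
// this batch element.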
|
||||
for (int64_t j = 0; j < indices_num; ++j) {
|
||||
auto params_idx = final_order[j] * inner_size;
|
||||
auto out_idx = j * inner_size;
|
||||
memcpy(output_y0 + matrix_index * pivots_stride + out_idx, P_eye + params_idx, slice_size);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T_data, typename T_pivots>
|
||||
uint32_t LuUnpackCpuKernel::LuUnpackCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(kFirstInputIndex);
|
||||
Tensor *input1_tensor = ctx.Input(kSecondInputIndex);
|
||||
auto input_0_Shape = input0_tensor->GetTensorShape();
|
||||
auto input_1_Shape = input1_tensor->GetTensorShape();
|
||||
int32_t Lu_data_dims = input_0_Shape->GetDims();
|
||||
int64_t Lu_data_dim1 = input_0_Shape->GetDimSize(Lu_data_dims - 2);
|
||||
int64_t Lu_data_dim2 = input_0_Shape->GetDimSize(Lu_data_dims - 1);
|
||||
int32_t Lu_pivots_dims = input_1_Shape->GetDims();
|
||||
int64_t Lu_pivots_dim = input_1_Shape->GetDimSize(Lu_pivots_dims - 1);
|
||||
auto input_dim_size = input_0_Shape->GetDimSizes();
|
||||
auto input_x1 = reinterpret_cast<T_pivots *>(input1_tensor->GetData());
|
||||
|
||||
int32_t block_size = Lu_data_dim1 * Lu_data_dim1;
|
||||
T_data *P_eye = new T_data[block_size]{};
|
||||
T_data num = static_cast<T_data>(1);
|
||||
for (int32_t i = 0; i < Lu_data_dim1; i++) {
|
||||
*(P_eye + (Lu_data_dim1 + 1) * i) = num;
|
||||
}
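// P_eye is a (dim1 x dim1) identity matrix shared by all batch elements; each LuUnpack call
// permutes its rows to build the output P.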
|
||||
uint32_t check_status = 0;
|
||||
int64_t Lu_data_stride = Lu_data_dim1 * Lu_data_dim2;
|
||||
int64_t Lu_pivots_stride = Lu_pivots_dim;
|
||||
int64_t batch_num = ctx.Input(0)->NumElements() / Lu_data_stride;
|
||||
if (batch_num < kParallelBatchNum || Lu_data_dims == kLuDataMinRank) {
|
||||
for (int64_t matrix_index = 0; matrix_index < batch_num; matrix_index++) {
|
||||
T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride;
|
||||
check_status = LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye);
|
||||
if (check_status == KERNEL_STATUS_PARAM_INVALID) {
|
||||
return check_status;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > batch_num) {
|
||||
max_core_num = batch_num;
|
||||
}
|
||||
uint32_t parallel_status = 0;
|
||||
auto sharder = [&](int64_t start, int64_t end) {
|
||||
for (int64_t matrix_index = start; matrix_index < end; matrix_index++) {
|
||||
T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride;
|
||||
if (LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye) == KERNEL_STATUS_OK) {
|
||||
parallel_status = KERNEL_STATUS_OK;
|
||||
} else {
|
||||
parallel_status = KERNEL_STATUS_PARAM_INVALID;
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder),
|
||||
"LuUnpack Compute failed.");
|
||||
if (parallel_status != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
delete[] P_eye;
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
void LuUnpackCpuKernel::SetMap() {
|
||||
calls_[DT_INT8][DT_INT8] = LuUnpackCompute<int8_t, int8_t>;
|
||||
calls_[DT_INT8][DT_UINT8] = LuUnpackCompute<int8_t, uint8_t>;
|
||||
calls_[DT_INT8][DT_INT16] = LuUnpackCompute<int8_t, int16_t>;
|
||||
calls_[DT_INT8][DT_INT32] = LuUnpackCompute<int8_t, int32_t>;
|
||||
calls_[DT_INT8][DT_INT64] = LuUnpackCompute<int8_t, int64_t>;
|
||||
|
||||
calls_[DT_INT16][DT_INT8] = LuUnpackCompute<int16_t, int8_t>;
|
||||
calls_[DT_INT16][DT_INT16] = LuUnpackCompute<int16_t, int16_t>;
|
||||
calls_[DT_INT16][DT_INT32] = LuUnpackCompute<int16_t, int32_t>;
|
||||
calls_[DT_INT16][DT_INT64] = LuUnpackCompute<int16_t, int64_t>;
|
||||
calls_[DT_INT16][DT_UINT8] = LuUnpackCompute<int16_t, uint8_t>;
|
||||
|
||||
calls_[DT_INT32][DT_INT8] = LuUnpackCompute<int32_t, int8_t>;
|
||||
calls_[DT_INT32][DT_INT16] = LuUnpackCompute<int32_t, int16_t>;
|
||||
calls_[DT_INT32][DT_INT32] = LuUnpackCompute<int32_t, int32_t>;
|
||||
calls_[DT_INT32][DT_INT64] = LuUnpackCompute<int32_t, int64_t>;
|
||||
calls_[DT_INT32][DT_UINT8] = LuUnpackCompute<int32_t, uint8_t>;
|
||||
|
||||
calls_[DT_INT64][DT_INT8] = LuUnpackCompute<int64_t, int8_t>;
|
||||
calls_[DT_INT64][DT_INT16] = LuUnpackCompute<int64_t, int16_t>;
|
||||
calls_[DT_INT64][DT_INT32] = LuUnpackCompute<int64_t, int32_t>;
|
||||
calls_[DT_INT64][DT_INT64] = LuUnpackCompute<int64_t, int64_t>;
|
||||
calls_[DT_INT64][DT_UINT8] = LuUnpackCompute<int64_t, uint8_t>;
|
||||
|
||||
calls_[DT_FLOAT16][DT_INT8] = LuUnpackCompute<Eigen::half, int8_t>;
|
||||
calls_[DT_FLOAT16][DT_INT16] = LuUnpackCompute<Eigen::half, int16_t>;
|
||||
calls_[DT_FLOAT16][DT_INT32] = LuUnpackCompute<Eigen::half, int32_t>;
|
||||
calls_[DT_FLOAT16][DT_INT64] = LuUnpackCompute<Eigen::half, int64_t>;
|
||||
calls_[DT_FLOAT16][DT_UINT8] = LuUnpackCompute<Eigen::half, uint8_t>;
|
||||
|
||||
calls_[DT_FLOAT][DT_INT8] = LuUnpackCompute<float, int8_t>;
|
||||
calls_[DT_FLOAT][DT_INT16] = LuUnpackCompute<float, int16_t>;
|
||||
calls_[DT_FLOAT][DT_INT32] = LuUnpackCompute<float, int32_t>;
|
||||
calls_[DT_FLOAT][DT_INT64] = LuUnpackCompute<float, int64_t>;
|
||||
calls_[DT_FLOAT][DT_UINT8] = LuUnpackCompute<float, uint8_t>;
|
||||
|
||||
calls_[DT_DOUBLE][DT_INT8] = LuUnpackCompute<double, int8_t>;
|
||||
calls_[DT_DOUBLE][DT_INT16] = LuUnpackCompute<double, int16_t>;
|
||||
calls_[DT_DOUBLE][DT_INT32] = LuUnpackCompute<double, int32_t>;
|
||||
calls_[DT_DOUBLE][DT_INT64] = LuUnpackCompute<double, int64_t>;
|
||||
calls_[DT_DOUBLE][DT_UINT8] = LuUnpackCompute<double, uint8_t>;
|
||||
|
||||
calls_[DT_UINT8][DT_INT8] = LuUnpackCompute<uint8_t, int8_t>;
|
||||
calls_[DT_UINT8][DT_INT16] = LuUnpackCompute<uint8_t, int16_t>;
|
||||
calls_[DT_UINT8][DT_INT32] = LuUnpackCompute<uint8_t, int32_t>;
|
||||
calls_[DT_UINT8][DT_INT64] = LuUnpackCompute<uint8_t, int64_t>;
|
||||
calls_[DT_UINT8][DT_UINT8] = LuUnpackCompute<uint8_t, uint8_t>;
|
||||
}
|
||||
|
||||
uint32_t LuUnpackCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LuUnpack check input and output number failed.");
|
||||
Tensor *LU_data_ = ctx.Input(0);
|
||||
Tensor *LU_pivots_ = ctx.Input(1);
|
||||
std::shared_ptr<TensorShape> LU_data_shape = LU_data_->GetTensorShape();
|
||||
std::shared_ptr<TensorShape> LU_pivots_shape = LU_pivots_->GetTensorShape();
|
||||
int32_t LU_data_rank = LU_data_shape->GetDims();
|
||||
if (LU_data_rank < kLuDataMinRank) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The input dim size of LU_data must be at least 2-D, "
|
||||
"while %d",
|
||||
LU_data_rank);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
int32_t Lu_data_dims = LU_data_shape->GetDims();
|
||||
int64_t Lu_data_dim1 = LU_data_shape->GetDimSize(Lu_data_dims - 2);
|
||||
int64_t Lu_data_dim2 = LU_data_shape->GetDimSize(Lu_data_dims - 1);
|
||||
int32_t Lu_pivots_dims = LU_pivots_shape->GetDims();
|
||||
int64_t Lu_pivots_dim = LU_pivots_shape->GetDimSize(Lu_pivots_dims - 1);
|
||||
if (Lu_pivots_dim != std::min(Lu_data_dim1, Lu_data_dim2)) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The last dimension of LU_pivots must be the same as the minimum value "
|
||||
"of the last two dimensions of LU_data, "
|
||||
"but got The last dimension of LU_pivots [%d], the minimum value of "
|
||||
"the last two dimensions of LU_data: [%d]",
|
||||
Lu_pivots_dim, std::min(Lu_data_dim1, Lu_data_dim2));
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (int32_t i = 0; i < Lu_pivots_dims - 1; i++) {
|
||||
if (LU_data_shape->GetDimSize(i) != LU_pivots_shape->GetDimSize(i)) {
|
||||
KERNEL_LOG_ERROR(
|
||||
" LU_data's batch dimensions does not match LU_pivots's batch "
|
||||
"dimensions.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
DataType LU_data_dtype = static_cast<DataType>(LU_data_->GetDataType());
|
||||
bool LU_data_dtype_flag = LU_data_dtype != DT_FLOAT16 && LU_data_dtype != DT_FLOAT && LU_data_dtype != DT_DOUBLE &&
|
||||
LU_data_dtype != DT_INT8 && LU_data_dtype != DT_UINT8 && LU_data_dtype != DT_INT16 &&
|
||||
LU_data_dtype != DT_INT32 && LU_data_dtype != DT_INT64;
|
||||
if (LU_data_dtype_flag) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"Op LuUnpack first input LU_data_type's data type should be of the "
|
||||
"follows: "
|
||||
"DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT16, "
|
||||
"DT_FLOAT, DT_DOUBLE, "
|
||||
"but this type is [%s].",
|
||||
DTypeStr(LU_data_dtype).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
DataType LU_pivots_dtype = static_cast<DataType>(LU_pivots_->GetDataType());
|
||||
bool LU_pivots_dtype_flag = LU_pivots_dtype != DT_INT8 && LU_pivots_dtype != DT_UINT8 &&
|
||||
LU_pivots_dtype != DT_INT16 && LU_pivots_dtype != DT_INT32 && LU_pivots_dtype != DT_INT64;
|
||||
if (LU_pivots_dtype_flag) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"Op LuUnpack second input LU_pivots_type's data type should be of the "
|
||||
"follows: "
|
||||
"DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, "
|
||||
"but this type is [%s].",
|
||||
DTypeStr(LU_pivots_dtype).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
SetMap();
|
||||
std::vector<DataType> LU_data_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32,
|
||||
DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE};
|
||||
std::vector<DataType> LU_pivots_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64};
|
||||
for (uint64_t i = 0; i < LU_data_type_vec.size(); i++) {
|
||||
for (uint64_t j = 0; j < LU_pivots_type_vec.size(); j++) {
|
||||
if (LU_data_dtype == LU_data_type_vec[i] && LU_pivots_dtype == LU_pivots_type_vec[j]) {
|
||||
KERNEL_HANDLE_ERROR(calls_[LU_data_type_vec[i]][LU_pivots_type_vec[j]](ctx),
|
||||
"The elements of LU_pivots must be greater than 1 "
|
||||
"and be less than the size of LU_pivots's last dimension.");
|
||||
}
|
||||
}
|
||||
}
|
||||
calls_.clear();
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kLuUnpack, LuUnpackCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_LUUNPACK_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_LUUNPACK_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
namespace aicpu {
|
||||
class LuUnpackCpuKernel : public CpuKernel {
|
||||
public:
|
||||
LuUnpackCpuKernel() = default;
|
||||
~LuUnpackCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T_data, typename T_pivots>
|
||||
static uint32_t LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index, T_data *P_eye);
|
||||
template <typename T_data, typename T_pivots>
|
||||
static uint32_t LuUnpackCompute(CpuKernelContext &ctx);
|
||||
template <typename T_pivots>
|
||||
static uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
|
||||
std::map<int, std::map<int, std::function<uint32_t(CpuKernelContext &)>>> calls_;
|
||||
void SetMap();
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,183 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "lu_unpack_grad.h"
|
||||
#include <iostream>
|
||||
#include "Eigen/Core"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "securec.h"
|
||||
#include "status.h"
|
||||
#include "utils/broadcast_iterator.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kLuUnpackGrad = "LuUnpackGrad";
|
||||
const int64_t kParallelBatchNum = 30;
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 2;
|
||||
const uint32_t kInputFirst = 0;
|
||||
const uint32_t kInputSecond = 1;
|
||||
const uint32_t kInputThird = 2;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t LuUnpackGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Lu Unpack Grad check input and output number failed.");
|
||||
// choose compute function depend on dataType
|
||||
auto input_type = static_cast<DataType>(ctx.Input(kInputThird)->GetDataType());
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return LuUnpackGradCompute<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return LuUnpackGradCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return LuUnpackGradCompute<double>(ctx);
|
||||
case DT_INT8:
|
||||
return LuUnpackGradCompute<int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return LuUnpackGradCompute<int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return LuUnpackGradCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return LuUnpackGradCompute<int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return LuUnpackGradCompute<uint8_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t LuUnpackGradCpuKernel::TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a) {
|
||||
Tensor *L_grad = NULL;
|
||||
Tensor *U_grad = NULL;
|
||||
Tensor *LU_data = NULL;
|
||||
L_grad = ctx.Input(kInputFirst);
|
||||
U_grad = ctx.Input(kInputSecond);
|
||||
LU_data = ctx.Input(kInputThird);
|
||||
auto LU_data_shape = LU_data->GetTensorShape();
|
||||
int32_t LU_data_dims = LU_data_shape->GetDims();
|
||||
int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2);
|
||||
int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1);
|
||||
auto LU_dim_min = std::min(LU_data_height, LU_data_width);
|
||||
auto input_U_shape = U_grad->GetTensorShape();
|
||||
auto input_U_dim_size = input_U_shape->GetDimSizes();
|
||||
auto input_U_dims = input_U_shape->GetDims();
|
||||
int64_t matrix_U_width = input_U_dim_size[input_U_dims - 2];
|
||||
int64_t matrix_U_height = input_U_dim_size[input_U_dims - 1];
|
||||
int64_t matrix_U_size = matrix_U_width * matrix_U_height;
|
||||
auto input_L_shape = L_grad->GetTensorShape();
|
||||
auto input_L_dim_size = input_L_shape->GetDimSizes();
|
||||
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
|
||||
auto input_L_dims = input_L_shape->GetDims();
|
||||
int64_t matrix_L_width = input_L_dim_size[input_L_dims - 2];
|
||||
int64_t matrix_L_height = input_L_dim_size[input_L_dims - 1];
|
||||
int64_t matrix_L_size = matrix_L_width * matrix_L_height;
|
||||
int64_t output_stride = LU_data_height * LU_data_width;
|
||||
|
||||
MatrixMap input_L(reinterpret_cast<T *>(L_grad->GetData()) + a * matrix_L_size, matrix_L_width, matrix_L_height);
|
||||
MatrixMap input_U(reinterpret_cast<T *>(U_grad->GetData()) + a * matrix_U_size, matrix_U_width, matrix_U_height);
|
||||
if (LU_data_width > LU_data_height) {
|
||||
MatrixMap output_L(reinterpret_cast<T *>(L_grad_output->GetData()) + a * output_stride, LU_data_height,
|
||||
LU_data_width);
|
||||
T *MiddlePtr = new T[matrix_L_size];
|
||||
MatrixMap MiddleData(MiddlePtr, matrix_L_width, matrix_L_height);
|
||||
MiddleData = input_L.template triangularView<Eigen::StrictlyLower>();
|
||||
for (auto i = 0; i < LU_data_height; i++) {
|
||||
for (auto j = 0; j < LU_dim_min; j++) {
|
||||
output_L(i, j) = MiddleData(i, j);
|
||||
}
|
||||
}
|
||||
delete[] MiddlePtr;
|
||||
} else {
|
||||
MatrixMap output_L(reinterpret_cast<T *>(L_grad_output->GetData()) + a * output_stride, LU_data_height,
|
||||
LU_data_width);
|
||||
output_L = input_L.template triangularView<Eigen::StrictlyLower>();
|
||||
}
|
||||
if (LU_data_height > LU_data_width) {
|
||||
MatrixMap output_U(reinterpret_cast<T *>(U_grad_output->GetData()) + a * output_stride, LU_data_height,
|
||||
LU_data_width);
|
||||
T *MiddlePtr = new T[matrix_U_size];
|
||||
MatrixMap MiddleData(MiddlePtr, matrix_U_width, matrix_U_height);
|
||||
MiddleData = input_U.template triangularView<Eigen::Upper>();
|
||||
for (auto i = 0; i < LU_dim_min; i++) {
|
||||
for (auto j = i; j < LU_data_width; j++) {
|
||||
output_U(i, j) = MiddleData(i, j);
|
||||
}
|
||||
}
|
||||
delete[] MiddlePtr;
|
||||
} else {
|
||||
MatrixMap output_U(reinterpret_cast<T *>(U_grad_output->GetData()) + a * output_stride, LU_data_height,
|
||||
LU_data_width);
|
||||
output_U = input_U.template triangularView<Eigen::Upper>();
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t LuUnpackGradCpuKernel::LuUnpackGradCompute(CpuKernelContext &ctx) {
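// Zero-initializes both gradient outputs, then runs TriLU once per matrix in the batch.
// Batches with at least kParallelBatchNum matrices are sharded across CPU cores via ParallelFor.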
|
||||
Tensor *LU_data = NULL;
|
||||
Tensor *L_grad_output = NULL;
|
||||
Tensor *U_grad_output = NULL;
|
||||
LU_data = ctx.Input(kInputThird);
|
||||
L_grad_output = ctx.Output(0);
|
||||
U_grad_output = ctx.Output(1);
|
||||
|
||||
auto LU_data_shape = LU_data->GetTensorShape();
|
||||
int32_t LU_data_dims = LU_data_shape->GetDims();
|
||||
int64_t LU_data_elem_num = LU_data->NumElements();
|
||||
|
||||
int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2);
|
||||
int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1);
|
||||
int64_t LU_data_stride = LU_data_height * LU_data_width;
|
||||
int64_t matrix_num = LU_data_elem_num / LU_data_stride;
|
||||
|
||||
auto L_grad_output_data = reinterpret_cast<T *>(L_grad_output->GetData());
|
||||
auto U_grad_output_data = reinterpret_cast<T *>(U_grad_output->GetData());
|
||||
for (int64_t i = 0; i < LU_data_elem_num; i++) {
|
||||
*(L_grad_output_data + i) = static_cast<T>(0);
|
||||
*(U_grad_output_data + i) = static_cast<T>(0);
|
||||
}
|
||||
if (matrix_num < kParallelBatchNum) {
|
||||
for (int64_t i = 0; i < matrix_num; i++) {
|
||||
TriLU<T>(ctx, L_grad_output, U_grad_output, i);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > matrix_num) {
|
||||
max_core_num = matrix_num;
|
||||
}
|
||||
auto sharder = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; i++) {
|
||||
TriLU<T>(ctx, L_grad_output, U_grad_output, i);
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharder),
|
||||
"LuUnpackGrad Compute failed.");
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kLuUnpackGrad, LuUnpackGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_LU_UNPACK_GRAD_H_
|
||||
#define AICPU_KERNELS_LU_UNPACK_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class LuUnpackGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~LuUnpackGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief compute for all types
|
||||
* @param ctx cpu kernel context
|
||||
* @return status if success
|
||||
*/
|
||||
template <typename T>
|
||||
uint32_t LuUnpackGradCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,179 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "matmul.h"
|
||||
|
||||
#include <complex>
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
|
||||
#include "utils/kernel_util.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const char *kMatmul = "MatMul";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
uint32_t MatMulCpuKernel::AddCompute(CpuKernelContext &ctx, Bcast &bcast) {
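// Adds the broadcast bias (input x3) to the matrix product that MatMulCompute has already
// written into the output tensor: out[i] = x3[bcast_x_index(i)] + out[bcast_y_index(i)].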
|
||||
auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
auto input1 = in2 + bcast.GetBroadcastXIndex(i);  // broadcast value of the bias input x3
|
||||
auto input2 = out + bcast.GetBroadcastYIndex(i);  // i-th value of the matmul result already stored in output
|
||||
*(out + i) = (*input1) + (*input2);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MatMulCpuKernel::BiasCompute(CpuKernelContext &ctx) {
|
||||
auto input0_tensor = ctx.Input(0);
|
||||
auto input2_tensor = ctx.Input(2);
|
||||
auto input2_shape = input2_tensor->GetTensorShape()->GetDimSizes();
|
||||
auto output_tensor = ctx.Output(kFirstOutputIndex);
|
||||
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
|
||||
|
||||
KERNEL_CHECK_FALSE(input2_tensor->GetTensorShape()->GetDims() == 1, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[x3] must be a 1D tensor")
|
||||
|
||||
DataType input0_data_type = input0_tensor->GetDataType();
|
||||
DataType input2_data_type = input2_tensor->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_data_type == input2_data_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[x1] data type[%s] and input[x3] data type[%s] must be same",
|
||||
DTypeStr(input0_data_type).c_str(), DTypeStr(input2_data_type).c_str())
|
||||
|
||||
Bcast bcast(input2_shape, output_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return AddCompute<T>(ctx, bcast);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MatMulCpuKernel::MatMulCompute(CpuKernelContext &ctx) {
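// Checks that both inputs are matrices whose inner dimensions match (honoring transpose_x1 /
// transpose_x2), maps them as row-major Eigen matrices, multiplies them into the output, and
// finally adds the optional bias input via BiasCompute when a third input is supplied.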
|
||||
auto input0_tensor = ctx.Input(0);
|
||||
auto input0_tensor_shape = input0_tensor->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((IsMatrix(input0_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[x1] must be a matrix")
|
||||
|
||||
auto input1_tensor = ctx.Input(1);
|
||||
auto input1_tensor_shape = input1_tensor->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((IsMatrix(input1_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[x2] must be a matrix")
|
||||
|
||||
auto transpose_x1 = ctx.GetAttr("transpose_x1")->GetBool();
|
||||
auto transpose_x2 = ctx.GetAttr("transpose_x2")->GetBool();
|
||||
KERNEL_LOG_DEBUG(
|
||||
"%s Attr[transpose_x1] value[%d], "
|
||||
"Attr[transpose_x2] value[%d].",
|
||||
kMatmul, transpose_x1, transpose_x2);
|
||||
int32_t x1_dim = transpose_x1 ? 0 : 1;
|
||||
int32_t x2_dim = transpose_x2 ? 1 : 0;
|
||||
KERNEL_CHECK_FALSE((input0_tensor_shape->GetDimSize(x1_dim) == input1_tensor_shape->GetDimSize(x2_dim)),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Matrix size incompatible, input[x1] dim[%d] value[%lld], "
|
||||
"input[x2] dim[%d] value[%lld]",
|
||||
x1_dim, input0_tensor_shape->GetDimSize(x1_dim), x2_dim, input1_tensor_shape->GetDimSize(x2_dim))
|
||||
|
||||
auto input0_shape = input0_tensor_shape->GetDimSizes();
|
||||
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
|
||||
MatrixMap input0(reinterpret_cast<T *>(input0_tensor->GetData()), input0_shape[0], input0_shape[1]);
|
||||
|
||||
auto input1_shape = input1_tensor_shape->GetDimSizes();
|
||||
MatrixMap input1(reinterpret_cast<T *>(input1_tensor->GetData()), input1_shape[0], input1_shape[1]);
|
||||
|
||||
auto output_tensor = ctx.Output(kFirstOutputIndex);
|
||||
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
|
||||
MatrixMap output(reinterpret_cast<T *>(output_tensor->GetData()), output_shape[0], output_shape[1]);
|
||||
if (transpose_x1) {
|
||||
if (transpose_x2) {
|
||||
output = input0.transpose() * input1.transpose();
|
||||
} else {
|
||||
output = input0.transpose() * input1;
|
||||
}
|
||||
} else {
|
||||
if (transpose_x2) {
|
||||
output = input0 * input1.transpose();
|
||||
} else {
|
||||
output = input0 * input1;
|
||||
}
|
||||
}
|
||||
if (ctx.GetInputsSize() == 3) {
|
||||
return BiasCompute<T>(ctx);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MatMulCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
uint32_t input_num = ctx.GetInputsSize();
|
||||
uint32_t output_num = ctx.GetOutputsSize();
|
||||
if ((input_num != 2 && input_num != 3) || output_num != 1) {
|
||||
KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
auto input0_tensor = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(input0_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x1] data failed",
|
||||
ctx.GetOpType().c_str())
|
||||
|
||||
auto input1_tensor = ctx.Input(1);
|
||||
auto input1_tensor_shape = input1_tensor->GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(input1_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x2] data failed",
|
||||
ctx.GetOpType().c_str())
|
||||
|
||||
DataType input0_data_type = input0_tensor->GetDataType();
|
||||
DataType input1_data_type = input1_tensor->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_data_type == input1_data_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[x1] data type[%s] and input[x2] data type[%s] must be same",
|
||||
DTypeStr(input0_data_type).c_str(), DTypeStr(input1_data_type).c_str())
|
||||
KERNEL_LOG_DEBUG("%s op input[x1] data type is [%s].", kMatmul, DTypeStr(input0_data_type).c_str());
|
||||
uint32_t ret = KERNEL_STATUS_OK;
|
||||
switch (input0_data_type) {
|
||||
case DT_FLOAT:
|
||||
ret = MatMulCompute<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
ret = MatMulCompute<double>(ctx);
|
||||
break;
|
||||
case DT_FLOAT16:
|
||||
ret = MatMulCompute<Eigen::half>(ctx);
|
||||
break;
|
||||
case DT_INT32:
|
||||
ret = MatMulCompute<int32_t>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
ret = MatMulCompute<std::complex<float>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
ret = MatMulCompute<std::complex<double>>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(input0_data_type).c_str());
|
||||
ret = KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMatmul, MatMulCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_HOST_MATMUL_H_
|
||||
#define AICPU_KERNELS_HOST_MATMUL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MatMulCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MatMulCpuKernel() = default;
|
||||
~MatMulCpuKernel() = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t AddCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
template <typename T>
|
||||
uint32_t BiasCompute(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t MatMulCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,320 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "matrix_exp.h"
|
||||
|
||||
#include <array>
|
||||
#include <complex>
|
||||
#include <cmath>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kMatrixExpInputNum = 1;
|
||||
constexpr uint32_t kMatrixExpOutputNum = 1;
|
||||
constexpr uint32_t kIndexTwo = 2;
|
||||
const int64_t paralled_data_size = 8 * 1024;
|
||||
const char *kMatrixExp = "MatrixExp";
|
||||
constexpr int total_n_degs = 6;
|
||||
|
||||
// Coefficients for computing taylor approximant of order 8.
|
||||
constexpr double sqrt_177 = 0.1330413469565007072504e+2, x3 = 2. / 3.;
|
||||
constexpr double x1 = x3 * ((1. + sqrt_177) / 88.), x2 = x3 * ((1. + sqrt_177) / 352.);
|
||||
constexpr double x4 = (-271. + 29. * sqrt_177) / (315. * x3), x5 = (-11. + 11. * sqrt_177) / (1260. * x3);
|
||||
constexpr double x6 = (-99. + 11. * sqrt_177) / (5040. * x3), x7 = (89. - sqrt_177) / (5040. * x3);
|
||||
constexpr double y2 = (857. - 58. * sqrt_177) / 630.;
|
||||
|
||||
template <typename T, int ROW, int COL>
|
||||
using array2d = std::array<std::array<T, COL>, ROW>;
|
||||
|
||||
// Coefficients for computing taylor approximant of order 12.
|
||||
constexpr int num_prods_12 = 4;
|
||||
array2d<double, num_prods_12, num_prods_12> b12 = {
|
||||
{{9.0198e-16, 0.46932117595418237389, -0.20099424927047284052, -0.04623946134063071740},
|
||||
{5.31597895759871264183, 1.19926790417132231573, 0.01179296240992997031, 0.01108844528519167989},
|
||||
{0.18188869982170434744, 0.05502798439925399070, 0.09351590770535414968, 0.00610700528898058230},
|
||||
{-2.0861320e-13, -0.13181061013830184015, -0.02027855540589259079, -0.00675951846863086359}}};
|
||||
|
||||
// Coefficients for computing taylor approximant of order 18.
|
||||
constexpr int num_prods_18 = 5;
|
||||
array2d<double, num_prods_18, num_prods_18> b18 = {
|
||||
{{0., -1.00365581030144618291e-01, -8.02924648241156932449e-03, -8.92138498045729985177e-04, 0.},
|
||||
{0., 3.97849749499645077844e-01, 1.36783778460411720168e+00, 4.98289622525382669416e-01,
|
||||
-6.37898194594723280150e-04},
|
||||
{-1.09676396052962061844e+01, 1.68015813878906206114e+00, 5.71779846478865511061e-02, -6.98210122488052056106e-03,
|
||||
3.34975017086070470649e-05},
|
||||
{-9.04316832390810593223e-02, -6.76404519071381882256e-02, 6.75961301770459654925e-02, 2.95552570429315521194e-02,
|
||||
-1.39180257516060693404e-05},
|
||||
{0., 0., -9.23364619367118555360e-02, -1.69364939002081722752e-02, -1.40086798182036094347e-05}}};
|
||||
|
||||
// Threshold for different order of taylor approximant.
|
||||
constexpr std::array<float, total_n_degs> thetas_float = {1.192092800768788e-07, 5.978858893805233e-04,
|
||||
5.116619363445086e-02, 5.800524627688768e-01,
|
||||
1.461661507209034e+00, 3.010066362817634e+00};
|
||||
|
||||
// Threshold for different order of taylor approximant.
|
||||
constexpr std::array<double, total_n_degs> thetas_double = {2.220446049250313e-16, 2.580956802971767e-08,
|
||||
3.397168839976962e-04, 4.991228871115323e-02,
|
||||
2.996158913811580e-01, 1.090863719290036e+00};
|
||||
|
||||
#define MATRIX_EXP_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MatrixExpCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define MATRIX_EXP_COMPUTE_DIFF_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MatrixExpDiffTypeCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MatrixExpCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMatrixExpInputNum, kMatrixExpOutputNum),
|
||||
"[%s] check input and output number failed.", kMatrixExp);
|
||||
KERNEL_HANDLE_ERROR(MatrixExpCheck(ctx), "[%s] check params failed.", kMatrixExp);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MATRIX_EXP_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MATRIX_EXP_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
MATRIX_EXP_COMPUTE_DIFF_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MatrixExp kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MatrixExpCpuKernel::MatrixExpCheck(CpuKernelContext &ctx) {
|
||||
auto input_0 = ctx.Input(0);
|
||||
std::vector<int64_t> shape_x = input_0->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size_x = shape_x.size();
|
||||
KERNEL_CHECK_FALSE((shape_size_x > 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 2, got [%zu].",
|
||||
shape_size_x)
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size_x - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input x's last dimension must be at least 1.")
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size_x - kIndexTwo] == shape_x[shape_size_x - 1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input x's last two dimensions must be equal, but are [%lld] and [%lld].",
|
||||
shape_x[shape_size_x - kIndexTwo], shape_x[shape_size_x - 1])
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename Derived1, typename Derived2, typename Derived3>
|
||||
void MatrixExpCpuKernel::MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
|
||||
int order, Eigen::MatrixBase<Derived3> &E) {
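// Evaluates a truncated Taylor approximant of exp(A) of the requested order and stores it in E.
// Orders 1, 2 and 4 use the plain polynomial in A; orders 8, 12 and 18 reuse the precomputed
// coefficients (x1..x7, y2, b12, b18) to combine a small number of matrix products, so only a
// few matrix multiplications are needed.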
|
||||
constexpr int expension_order_1 = 1;
|
||||
constexpr int expension_order_2 = 2;
|
||||
constexpr int expension_order_4 = 4;
|
||||
constexpr int expension_order_8 = 8;
|
||||
constexpr int expension_order_12 = 12;
|
||||
auto A2 = A * A;
|
||||
auto A3 = A * A2;
|
||||
if (order == expension_order_1) {
|
||||
E = I + A;
|
||||
} else if (order == expension_order_2) {
|
||||
constexpr int A2_divisor = 2;
|
||||
E = I + A + A2 / A2_divisor;
|
||||
} else if (order == expension_order_4) {
|
||||
constexpr int I_divisor = 2;
|
||||
constexpr int A_divisor = 6;
|
||||
constexpr int A2_divisor = 24;
|
||||
E = I + A + A2 * (I / I_divisor + A / A_divisor + A2 / A2_divisor);
|
||||
} else if (order == expension_order_8) {
|
||||
auto A4 = A2 * (x1 * A + x2 * A2);
|
||||
auto A8 = (x3 * A2 + A4) * (x4 * I + x5 * A + x6 * A2 + x7 * A4);
|
||||
E = I + A + y2 * A2 + A8;
|
||||
} else if (order == expension_order_12) {
|
||||
auto q31 = b12[0][0] * I + b12[0][1] * A + b12[0][2] * A2 + b12[0][3] * A3;
|
||||
auto q32 = b12[1][0] * I + b12[1][1] * A + b12[1][2] * A2 + b12[1][3] * A3;
|
||||
auto q33 = b12[2][0] * I + b12[2][1] * A + b12[2][2] * A2 + b12[2][3] * A3;
|
||||
auto q34 = b12[3][0] * I + b12[3][1] * A + b12[3][2] * A2 + b12[3][3] * A3;
|
||||
auto q61 = q33 + q34 * q34;
|
||||
E = q31 + (q32 + q61) * q61;
|
||||
} else {
|
||||
auto A6 = A3 * A3;
|
||||
auto q31 = b18[0][0] * I + b18[0][1] * A + b18[0][2] * A2 + b18[0][3] * A3 + b18[0][4] * A6;
|
||||
auto q61 = b18[1][0] * I + b18[1][1] * A + b18[1][2] * A2 + b18[1][3] * A3 + b18[1][4] * A6;
|
||||
auto q62 = b18[2][0] * I + b18[2][1] * A + b18[2][2] * A2 + b18[2][3] * A3 + b18[2][4] * A6;
|
||||
auto q63 = b18[3][0] * I + b18[3][1] * A + b18[3][2] * A2 + b18[3][3] * A3 + b18[3][4] * A6;
|
||||
auto q64 = b18[4][0] * I + b18[4][1] * A + b18[4][2] * A2 + b18[4][3] * A3 + b18[4][4] * A6;
|
||||
auto q91 = q31 * q64 + q63;
|
||||
E = q61 + (q62 + q91) * q91;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Derived1, typename Derived2>
|
||||
void MatrixExpCpuKernel::MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
|
||||
Eigen::MatrixBase<Derived1> &mexp, CpuKernelContext &ctx) {
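// Scaling-and-squaring driver: picks the smallest Taylor order whose threshold covers the
// 1-norm of A (maximum absolute column sum). If the norm exceeds the largest threshold, A is
// scaled by 2^-s, the order-18 approximant is applied, and the result is squared s times.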
|
||||
const auto norm = A.cwiseAbs().colwise().sum().maxCoeff();
|
||||
constexpr std::array<int, total_n_degs> m_vals = {1, 2, 4, 8, 12, 18};
|
||||
constexpr int cut_deg = 2;
|
||||
int64_t s = -1;
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
if (data_type == DT_FLOAT16 || data_type == DT_FLOAT || data_type == DT_COMPLEX64) {
|
||||
for (int i = 0; i < total_n_degs - 1; i++) {
|
||||
if (norm <= thetas_float[i]) {
|
||||
MTaylorApproximant(A, I, m_vals[i], mexp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (norm >= thetas_float[total_n_degs - cut_deg]) {
|
||||
s = ceil(log2(norm / thetas_float[total_n_degs - 1]));
|
||||
if (s <= 0) {
|
||||
s = 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < total_n_degs - 1; i++) {
|
||||
if (norm <= thetas_double[i]) {
|
||||
MTaylorApproximant(A, I, m_vals[i], mexp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (norm >= thetas_double[total_n_degs - cut_deg]) {
|
||||
s = ceil(log2(norm / thetas_double[total_n_degs - 1]));
|
||||
if (s <= 0) {
|
||||
s = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (s >= 0) {
|
||||
const auto pow2s = pow(2, s);
|
||||
const auto A_scaled = A / pow2s;
|
||||
MTaylorApproximant(A_scaled, I, m_vals[total_n_degs - 1], mexp);
|
||||
for (int k = 0; k < s; k++) {
|
||||
mexp = mexp * mexp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MatrixExpCpuKernel::MatrixExpCompute(CpuKernelContext &ctx) {
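// Treats the input as a batch of m x m matrices and exponentiates each one against an identity
// of the same size. Inputs larger than paralled_data_size bytes are sharded with ParallelFor.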
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size = shape_x.size();
|
||||
int64_t m = shape_x[shape_size - 1];
|
||||
int64_t size_mm = m * m;
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
|
||||
MatrixXd I(m, m);
|
||||
I.setIdentity();
|
||||
int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
|
||||
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
|
||||
if (data_size <= paralled_data_size) {
|
||||
for (int64_t i = 0; i < matrix_num; i++) {
|
||||
Eigen::Map<MatrixXd> matrix_x(input_x + i * m * m, m, m);
|
||||
Eigen::Map<MatrixXd> matrix_y(output_y + i * m * m, m, m);
|
||||
if (matrix_x.size() > 0) {
|
||||
MexpImpl(matrix_x, I, matrix_y, ctx);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num == 0) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (max_core_num > matrix_num) {
|
||||
max_core_num = matrix_num;
|
||||
}
|
||||
auto shard_work = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
Eigen::Map<MatrixXd> matrix_x(input_x + i * m * m, m, m);
|
||||
Eigen::Map<MatrixXd> matrix_y(output_y + i * m * m, m, m);
|
||||
if (matrix_x.size() > 0) {
|
||||
MexpImpl(matrix_x, I, matrix_y, ctx);
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work),
|
||||
"MatrixExp Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
void MatrixExpCpuKernel::TypeChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y,
|
||||
CpuKernelContext &ctx) {
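// Promotes the i-th float16 matrix to float, computes its exponential in single precision via
// MexpImpl, and casts the result back to float16 when writing output_y.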
|
||||
typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
|
||||
MatrixXd I(m, m);
|
||||
(void)I.setIdentity();
|
||||
MatrixXd matrix_x(m, m);
|
||||
MatrixXd matrix_y(m, m);
|
||||
int64_t size_mm = m * m;
|
||||
for (int p = 0; p < m; p++) {
|
||||
for (int q = 0; q < m; q++) {
|
||||
matrix_x(p, q) = static_cast<float>(input_x[i * size_mm + p * m + q]);
|
||||
}
|
||||
}
|
||||
if (matrix_x.size() > 0) {
|
||||
MexpImpl(matrix_x, I, matrix_y, ctx);
|
||||
}
|
||||
for (int p = 0; p < m; p++) {
|
||||
for (int q = 0; q < m; q++) {
|
||||
output_y[i * size_mm + p * m + q] = static_cast<Eigen::half>(matrix_y(p, q));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MatrixExpCpuKernel::MatrixExpDiffTypeCompute(CpuKernelContext &ctx) {
|
||||
T *input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size = shape_x.size();
|
||||
int64_t m = shape_x[shape_size - 1];
|
||||
int64_t size_mm = m * m;
|
||||
int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
|
||||
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
|
||||
if (data_size <= paralled_data_size) {
|
||||
for (int64_t i = 0; i < matrix_num; i++) {
|
||||
TypeChangeForFp16(i, m, input_x, output_y, ctx);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num == 0) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (max_core_num > matrix_num) {
|
||||
max_core_num = matrix_num;
|
||||
}
|
||||
auto shard_work = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
TypeChangeForFp16(i, m, input_x, output_y, ctx);
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work),
|
||||
"MatrixExp Compute failed.");
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMatrixExp, MatrixExpCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
namespace aicpu {
|
||||
class MatrixExpCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MatrixExpCpuKernel() = default;
|
||||
~MatrixExpCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t MatrixExpCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename Derived1, typename Derived2, typename Derived3>
|
||||
void MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I, int order,
|
||||
Eigen::MatrixBase<Derived3> &E);
|
||||
|
||||
template <typename Derived1, typename Derived2>
|
||||
void MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
|
||||
Eigen::MatrixBase<Derived1> &mexp, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MatrixExpCompute(CpuKernelContext &ctx);
|
||||
|
||||
void TypeChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MatrixExpDiffTypeCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,460 @@
|
|||
/**
|
||||
* Copyright(c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "maximum.h"
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "cmath"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "iostream"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kMaximum = "Maximum";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
|
||||
#define MAXIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MaximumCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Maximum kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MaximumCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Maximum check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MaximumParamCheck(ctx), "Maximum check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MAXIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
MAXIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
MAXIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
MAXIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MAXIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Maximum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MaximumCpuKernel::MaximumParamCheck(CpuKernelContext &ctx) {
|
||||
// the non-null checks of input_0, input_1 and output have already been done in NormalCheck
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
|
||||
KERNEL_LOG_DEBUG(
|
||||
"MaximumCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MaximumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
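// Element-wise maximum for equally shaped inputs. The optional "ignore_nan" attribute controls
// NaN handling: when true a NaN operand is skipped and the other value is returned, when false
// (the default) the NaN itself is propagated. Eigen::half values are tested with
// Eigen::numext::isnan; other types use isnan.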
|
||||
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else if (Eigen::numext::isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else if (isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else if (Eigen::numext::isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else if (isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MaximumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
|
||||
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else if (Eigen::numext::isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input1);
|
||||
} else {
|
||||
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*(input1))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else if (isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input1);
|
||||
} else {
|
||||
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1))) {
|
||||
*(output + i) = *(input1);
|
||||
} else if (Eigen::numext::isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else {
|
||||
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*(input1))) {
|
||||
*(output + i) = *(input1);
|
||||
} else if (isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else {
|
||||
*(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MaximumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
|
||||
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input2);
|
||||
} else if (Eigen::numext::isnan(*(input2))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input2);
|
||||
} else if (isnan(*(input2))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else if (Eigen::numext::isnan(*(input2))) {
|
||||
*(output + i) = *(input2);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else if (isnan(*(input2))) {
|
||||
*(output + i) = *(input2);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MaximumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
|
||||
bool is_float16 = false;
|
||||
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
|
||||
std::is_same<T, double>::value) {
|
||||
is_float16 = false;
|
||||
} else {
|
||||
is_float16 = true;
|
||||
}
|
||||
switch (type) {
|
||||
case BcastShapeType::SAME_SHAPE:
|
||||
SpecialComputeSameShape<T>(start, end, ctx, is_float16);
|
||||
break;
|
||||
case BcastShapeType::X_ONE_ELEMENT:
|
||||
SpecialComputeXOneElement<T>(start, end, ctx, is_float16);
|
||||
break;
|
||||
case BcastShapeType::Y_ONE_ELEMENT:
|
||||
SpecialComputeYOneElement<T>(start, end, ctx, is_float16);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
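// Fast path when no index mapping is required: the shapes are identical or one input is a
// single element. The work is sharded with ParallelFor once data_num reaches
// kParallelDataNumSameShape, and limited to 4 cores for mid-sized inputs.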
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
BcastShapeType type = in0_elements_nums == in1_elements_nums
|
||||
? BcastShapeType::SAME_SHAPE
|
||||
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_fmax = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax),
|
||||
"Maximum Compute failed.");
|
||||
} else {
|
||||
SpecialCompute<T>(type, 0, data_num, ctx);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MaximumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast,
|
||||
bool is_float16) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MaximumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
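// General broadcasting path: every output index is mapped back to its source elements through
// bcast.GetBroadcastXIndex / GetBroadcastYIndex. Parallelized once data_num reaches
// kParallelDataNum, otherwise computed on a single core by BcastComputeOneKernel.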
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
bool is_float16 = false;
|
||||
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
|
||||
std::is_same<T, double>::value) {
|
||||
is_float16 = false;
|
||||
} else {
|
||||
is_float16 = true;
|
||||
}
|
||||
if (data_num >= kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_fmax = [&](int64_t start, int64_t end) {
|
||||
BcastComputeMultiKernel<T>(start, end, ctx, bcast, is_float16);
|
||||
};
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax),
|
||||
"Maximum Compute failed.");
|
||||
} else {
|
||||
BcastComputeOneKernel<T>(ctx, bcast, is_float16);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumCpuKernel::MaximumCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
|
||||
|
||||
bool no_bcast_required = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
|
||||
if (no_bcast_required) {
|
||||
return NoBcastCompute<T>(ctx);
|
||||
} else {
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMaximum, MaximumCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,63 @@
|
|||
/**
|
||||
* Copyright(c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MaximumCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MaximumCpuKernel() = default;
|
||||
~MaximumCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t MaximumParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,456 @@
|
|||
/**
|
||||
* Copyright(c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "minimum.h"
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "cmath"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "iostream"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kMinimum = "Minimum";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
|
||||
#define MINIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MinimumCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Minimum kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MinimumCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Minimum check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MinimumParamCheck(ctx), "Minimum check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
|
||||
switch (data_type) {
|
||||
MINIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
MINIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
MINIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
MINIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MINIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Minimum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MinimumCpuKernel::MinimumParamCheck(CpuKernelContext &ctx) {
|
||||
// the non-null checks of input_0, input_1 and output have already been done in NormalCheck
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
|
||||
KERNEL_LOG_DEBUG(
|
||||
"MinimumCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MinimumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
|
||||
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
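// ignore_nan == false: a NaN in either input propagates to the output; ignore_nan == true: the non-NaN operand is returned instead.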
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else if (Eigen::numext::isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else if (isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else if (Eigen::numext::isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else if (isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void MinimumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
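// Input 0 holds a single element that is compared against every element of input 1.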
|
||||
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*input1)) {
|
||||
*(output + i) = *input1;
|
||||
} else if (Eigen::numext::isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else {
|
||||
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*input1)) {
|
||||
*(output + i) = *input1;
|
||||
} else if (isnan(*(input2 + i))) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else {
|
||||
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*input1)) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else if (Eigen::numext::isnan(*(input2 + i))) {
|
||||
*(output + i) = *input1;
|
||||
} else {
|
||||
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*input1)) {
|
||||
*(output + i) = *(input2 + i);
|
||||
} else if (isnan(*(input2 + i))) {
|
||||
*(output + i) = *input1;
|
||||
} else {
|
||||
*(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void MinimumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) {
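// Input 1 holds a single element that is compared against every element of input 0.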
|
||||
auto input1 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto input2 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else if (Eigen::numext::isnan(*input2)) {
|
||||
*(output + i) = *input2;
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*(input1 + i))) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else if (isnan(*input2)) {
|
||||
*(output + i) = *input2;
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(input1 + i))) {
|
||||
*(output + i) = *input2;
|
||||
} else if (Eigen::numext::isnan(*input2)) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*(input1 + i))) {
|
||||
*(output + i) = *input2;
|
||||
} else if (isnan(*input2)) {
|
||||
*(output + i) = *(input1 + i);
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MinimumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
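// is_float16 routes Eigen::half data through Eigen::numext::isnan in the helpers above; the other supported types use isnan directly.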
|
||||
bool is_float16 = false;
|
||||
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
|
||||
std::is_same<T, double>::value) {
|
||||
is_float16 = false;
|
||||
} else {
|
||||
is_float16 = true;
|
||||
}
|
||||
switch (type) {
|
||||
case BcastShapeType::SAME_SHAPE:
|
||||
SpecialComputeSameShape<T>(start, end, ctx, is_float16);
|
||||
break;
|
||||
case BcastShapeType::X_ONE_ELEMENT:
|
||||
SpecialComputeXOneElement<T>(start, end, ctx, is_float16);
|
||||
break;
|
||||
case BcastShapeType::Y_ONE_ELEMENT:
|
||||
SpecialComputeYOneElement<T>(start, end, ctx, is_float16);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
BcastShapeType type = in0_elements_nums == in1_elements_nums
|
||||
? BcastShapeType::SAME_SHAPE
|
||||
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
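// Run single-threaded for small tensors; from kParallelDataNumSameShape elements the loop is sharded, and up to kParallelDataNumSameShapeMid it is capped at 4 cores.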
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_minimum = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum),
|
||||
"Minimum Compute failed.");
|
||||
} else {
|
||||
SpecialCompute<T>(type, 0, data_num, ctx);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <typename T>
|
||||
void MinimumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast,
|
||||
bool is_float16) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
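// GetBroadcastXIndex/GetBroadcastYIndex map each flat output index back to the corresponding element of the broadcast inputs.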
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MinimumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto ignore_nan = false;
|
||||
auto ignore_nan_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (ignore_nan == false && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == false && is_float16 == false) {
|
||||
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == true) {
|
||||
if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
if (ignore_nan == true && is_float16 == false) {
|
||||
if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) {
|
||||
*(out + i) = *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
} else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i))
|
||||
? *(in0 + bcast.GetBroadcastXIndex(i))
|
||||
: *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
bool is_float16 = false;
|
||||
if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value || std::is_same<T, float>::value ||
|
||||
std::is_same<T, double>::value) {
|
||||
is_float16 = false;
|
||||
} else {
|
||||
is_float16 = true;
|
||||
}
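// Same sharding policy as NoBcastCompute, but with the lower kParallelDataNum / kParallelDataNumMid thresholds for broadcast cases.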
|
||||
if (data_num >= kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_minimum = [&](int64_t start, int64_t end) {
|
||||
BcastComputeMultiKernel<T>(start, end, ctx, bcast, is_float16);
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum),
|
||||
"Minimum Compute failed.");
|
||||
} else {
|
||||
BcastComputeOneKernel<T>(ctx, bcast, is_float16);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumCpuKernel::MinimumCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
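// Identical shapes or a scalar operand can skip broadcast index translation and take the element-wise fast path.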
|
||||
bool no_need_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
|
||||
if (no_need_bcast) {
|
||||
return NoBcastCompute<T>(ctx);
|
||||
} else {
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMinimum, MinimumCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,63 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MINIMUM_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MINIMUM_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MinimumCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MinimumCpuKernel() = default;
|
||||
~MinimumCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t MinimumParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
|
|
@@ -171,11 +171,8 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
|
|||
mindspore::kKLDivOpName,
|
||||
mindspore::kKlDivLossGradOpName,
|
||||
mindspore::kLcmOpName,
|
||||
mindspore::kLessEqualOpName,
|
||||
mindspore::kLogicalXorOpName,
|
||||
mindspore::kLogitOpName,
|
||||
mindspore::kLogitGradOpName,
|
||||
mindspore::kLogNormalReverseOpName,
|
||||
mindspore::kLowerBoundOpName,
|
||||
mindspore::kLstsqOpName,
|
||||
mindspore::kLuUnpackOpName,
|
||||
|
|