merge canndev code to mindspore

shenjingxing2 2023-02-06 19:45:40 +08:00
parent aacab0ca60
commit 6920953b56
32 changed files with 4396 additions and 1 deletions


@ -135,4 +135,5 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/comma"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "runtime/indentation_namespace"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/line_length"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/semicolon"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "readability/nolint"


@ -350,3 +350,8 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/resize_bicubic_grad.cc:aicpu::ResizeBicubicGrad
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_max.cc:aicpu::SegmentMaxCpuKernel::SegmentMaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/extract_glimpse.cc:aicpu::ExtractGlimpseCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_pool_3d_grad_with_argmax.cc:aicpu::MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_power.cc:aicpu::MatrixPowerCpuKernel::ComputeKernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d_grad.cc:aicpu::MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_pool_3d_with_argmax.cc:aicpu::MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/layer_norm_grad_grad.cc:aicpu::LayerNormGradGradCpuKernel::LayerNormGradGradCompute


@ -0,0 +1,226 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "layer_norm_grad_grad.h"
#include <cmath>
#include <numeric>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
using namespace std;
namespace {
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 8;
const char *kLayerNormGradGrad = "LayerNormGradGrad";
#define LAYERNORMGRADGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX, NUM) \
case (DTYPE): { \
uint32_t result = LayerNormGradGradCompute<TYPE>(CTX, NUM); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("LayerNormGradGrad kernel compute failed."); \
return result; \
} \
break; \
}
#define SWITCH_PARALLEL(SHARD, data_num, thread_num) \
if (data_num <= ParallelDataNums) { \
for (size_t i = 0; i < thread_num; i++) { \
SHARD(i, i + 1); \
} \
} else { \
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, thread_num, 1, SHARD), \
"LayerNormGradGrad ParallelFor Compute failed."); \
}
Eigen::half sqrt(Eigen::half &data) { return Eigen::half_impl::sqrt(data); }
} // namespace
namespace aicpu {
uint32_t LayerNormGradGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"LayerNormGradGrad check input and output number failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
LAYERNORMGRADGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx, 512)
LAYERNORMGRADGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx, 4 * 1024)
default:
KERNEL_LOG_ERROR("LayerNormGradGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LayerNormGradGradCpuKernel::LayerNormGradGradCompute(CpuKernelContext &ctx, size_t ParallelDataNums) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input_dy = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto input_var = reinterpret_cast<T *>(ctx.Input(2)->GetData());
auto input_mean = reinterpret_cast<T *>(ctx.Input(3)->GetData());
auto input_gamma = reinterpret_cast<T *>(ctx.Input(4)->GetData());
auto input_d_dx = reinterpret_cast<T *>(ctx.Input(5)->GetData());
auto input_d_dg = reinterpret_cast<T *>(ctx.Input(6)->GetData());
auto input_d_db = reinterpret_cast<T *>(ctx.Input(7)->GetData());
auto output_sopd_x = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto output_sopd_dy = reinterpret_cast<T *>(ctx.Output(1)->GetData());
auto output_sopd_g = reinterpret_cast<T *>(ctx.Output(2)->GetData());
size_t num = static_cast<size_t>(ctx.Input(0)->NumElements());
size_t g_num = static_cast<size_t>(ctx.Input(4)->NumElements());
size_t mean_num = static_cast<size_t>(ctx.Input(3)->NumElements());
KERNEL_CHECK_FALSE((g_num > 0), KERNEL_STATUS_PARAM_INVALID, "gamma should not be empty");
T *inv_std = new T[mean_num];
for (size_t i = 0; i < mean_num; i++) {
if (input_var[i] <= T(0)) {
KERNEL_LOG_ERROR("variance must be greater than zero");
delete[] inv_std;
return KERNEL_STATUS_PARAM_INVALID;
}
inv_std[i] = T(1) / sqrt(input_var[i]);
}
T *x_hat = new T[num];
T *dy_gamma = new T[num];
T *sum1 = new T[mean_num];
std::fill_n(sum1, mean_num, T(0));
T *sum2 = new T[mean_num];
std::fill_n(sum2, mean_num, T(0));
T *sum3 = new T[mean_num];
std::fill_n(sum3, mean_num, T(0));
T *sum4 = new T[mean_num];
std::fill_n(sum4, mean_num, T(0));
auto shard_inner_mean = [&](size_t start, size_t end) {
for (size_t sum_idx = start; sum_idx < end; sum_idx++) {
for (size_t g_idx = 0; g_idx < g_num; g_idx++) {
size_t i = g_idx + sum_idx * g_num; // value of sum_idx = i / g_num;
sum1[sum_idx] -= inv_std[sum_idx] * input_d_dx[i] / static_cast<T>(g_num);
T cur_x_hat = (input_x[i] - input_mean[sum_idx]) * inv_std[sum_idx];
x_hat[i] = cur_x_hat;
sum2[sum_idx] -= cur_x_hat * inv_std[sum_idx] * input_d_dx[i] / static_cast<T>(g_num);
T cur_dy_gamma = input_dy[i] * input_gamma[g_idx];
dy_gamma[i] = cur_dy_gamma;
sum3[sum_idx] += cur_dy_gamma / static_cast<T>(g_num);
sum4[sum_idx] += cur_dy_gamma * cur_x_hat / static_cast<T>(g_num);
}
}
};
SWITCH_PARALLEL(shard_inner_mean, num, mean_num);
T *sum5 = new T[mean_num];
std::fill_n(sum5, mean_num, T(0));
T *sum6 = new T[mean_num];
std::fill_n(sum6, mean_num, T(0));
T *sum7 = new T[mean_num];
std::fill_n(sum7, mean_num, T(0));
T *part3 = new T[num];
auto shard_outer_mean = [&](size_t start, size_t end) {
for (size_t sum_idx = start; sum_idx < end; sum_idx++) {
for (size_t g_idx = 0; g_idx < g_num; g_idx++) {
size_t i = g_idx + sum_idx * g_num; // value of sum_idx is i / g_num;
T part_sum1 = dy_gamma[i] - sum3[sum_idx] - x_hat[i] * sum4[sum_idx];
T part_sum2 = dy_gamma[i] * sum2[sum_idx] - sum4[sum_idx] * input_d_dx[i] * inv_std[sum_idx] +
input_dy[i] * input_d_dg[g_idx];
sum5[sum_idx] += input_d_dx[i] * part_sum1 / static_cast<T>(g_num);
sum6[sum_idx] += (input_x[i] - input_mean[sum_idx]) * part_sum2 / static_cast<T>(g_num);
T cur_part3 = inv_std[sum_idx] * part_sum2;
part3[i] = cur_part3;
sum7[sum_idx] -= cur_part3 / static_cast<T>(g_num);
}
}
};
SWITCH_PARALLEL(shard_outer_mean, num, mean_num);
if (sum3 != nullptr) {
delete[] sum3;
}
if (sum4 != nullptr) {
delete[] sum4;
}
if (dy_gamma != nullptr) {
delete[] dy_gamma;
}
auto shard_input_prop = [&](size_t start, size_t end) {
for (size_t sum_idx = start; sum_idx < end; sum_idx++) {
for (size_t g_idx = 0; g_idx < g_num; g_idx++) {
size_t i = g_idx + sum_idx * g_num; // value of sum_idx is i / g_num;
T cur_part4 = -x_hat[i] * inv_std[sum_idx] * inv_std[sum_idx] * (sum5[sum_idx] + sum6[sum_idx]);
output_sopd_x[i] = part3[i] + cur_part4 + sum7[sum_idx];
T cur_part5 = input_gamma[g_idx] * input_d_dx[i] * inv_std[sum_idx];
T cur_part6 = input_gamma[g_idx] * sum1[sum_idx];
T cur_part7 = input_gamma[g_idx] * x_hat[i] * sum2[sum_idx];
T cur_part8 = x_hat[i] * input_d_dg[g_idx];
output_sopd_dy[i] = cur_part5 + cur_part6 + cur_part7 + cur_part8 + input_d_db[g_idx];
}
}
};
SWITCH_PARALLEL(shard_input_prop, num, mean_num);
if (sum5 != nullptr) {
delete[] sum5;
}
if (sum6 != nullptr) {
delete[] sum6;
}
if (sum7 != nullptr) {
delete[] sum7;
}
std::fill_n(output_sopd_g, g_num, T(0));
auto shard_param_prop = [&](size_t start, size_t end) {
for (size_t g_idx = start; g_idx < end; g_idx++) {
for (size_t sum_idx = 0; sum_idx < mean_num; sum_idx++) {
size_t i = g_idx + sum_idx * g_num; // value of sum_idx is i / g_num;
T cur_part9 = input_dy[i] * x_hat[i] * sum2[sum_idx];
T cur_part10 = input_dy[i] * sum1[sum_idx];
T cur_part11 = input_dy[i] * input_d_dx[i] * inv_std[sum_idx];
output_sopd_g[g_idx] += cur_part9 + cur_part10 + cur_part11;
}
}
};
SWITCH_PARALLEL(shard_param_prop, num, g_num);
if (sum1 != nullptr) {
delete[] sum1;
}
if (sum2 != nullptr) {
delete[] sum2;
}
if (inv_std != nullptr) {
delete[] inv_std;
}
if (x_hat != nullptr) {
delete[] x_hat;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLayerNormGradGrad, LayerNormGradGradCpuKernel);
} // namespace aicpu
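A note on the sketch below: the kernel above keeps all of its scratch buffers (inv_std, x_hat, sum1..sum7, part3) in raw new[]/delete[] pairs, which is easy to leak on early-return paths. The following is a minimal illustrative sketch (not part of this commit) of the inverse-standard-deviation step written with an RAII std::vector buffer, assuming an arithmetic element type such as float:

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch only: 1/sqrt(var) into an RAII buffer, so early returns cannot leak,
// unlike the raw new[]/delete[] pattern used in the kernel above.
template <typename T>
bool ComputeInvStd(const T *var, std::size_t mean_num, std::vector<T> &inv_std) {
  inv_std.assign(mean_num, T(0));
  for (std::size_t i = 0; i < mean_num; ++i) {
    if (var[i] <= T(0)) {
      return false;  // variance must be positive; the vector cleans up automatically
    }
    inv_std[i] = T(1) / std::sqrt(var[i]);
  }
  return true;
}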


@ -0,0 +1,42 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LAYERNORMGRADGRAD_H_
#define AICPU_KERNELS_NORMALIZED_LAYERNORMGRADGRAD_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
#include "utils/eigen_tensor.h"
namespace aicpu {
class LayerNormGradGradCpuKernel : public CpuKernel {
public:
LayerNormGradGradCpuKernel() = default;
~LayerNormGradGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t LayerNormGradGradCheck(CpuKernelContext &ctx);
template <typename T>
static uint32_t LayerNormGradGradCompute(CpuKernelContext &ctx, size_t ParallelDataNums);
};
} // namespace aicpu
#endif


@ -0,0 +1,256 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "log.h"
#include <algorithm>
#include <cmath>
#include <complex>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kLog = "Log";
#define LOG_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = LogCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Log kernel compute failed."); \
return result; \
} \
break; \
}
#define LOG_COMPUTE_CASE2(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = LogCompute2(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Log kernel compute failed."); \
return result; \
} \
break; \
}
#define LOG_COMPUTE_CASE3(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = LogCompute3<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Log kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t LogCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLog);
KERNEL_HANDLE_ERROR(LogCheck(ctx), "[%s] check params failed.", kLog);
DataType data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
LOG_COMPUTE_CASE2(DT_FLOAT16, Eigen::half, ctx)
LOG_COMPUTE_CASE(DT_FLOAT, float, ctx)
LOG_COMPUTE_CASE(DT_DOUBLE, double, ctx)
LOG_COMPUTE_CASE3(DT_COMPLEX64, std::complex<float>, ctx)
LOG_COMPUTE_CASE3(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Log kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t LogCpuKernel::LogCheck(CpuKernelContext &ctx) {
auto input_0 = ctx.Input(0);
auto output_0 = ctx.Output(0);
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed")
KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
std::vector<int64_t> shape_x = input_0->GetTensorShape()->GetDimSizes();
size_t shape_size = shape_x.size();
KERNEL_CHECK_FALSE((shape_size > 0), KERNEL_STATUS_PARAM_INVALID, "Input must be at least rank 1, got [%zu].",
shape_x.size())
KERNEL_CHECK_FALSE((shape_x[shape_size - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
"Input last dimension must be at least 1.")
AttrValue *base_ptr = ctx.GetAttr("base");
KERNEL_CHECK_NULLPTR(base_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr base failed.");
float base_ = base_ptr->GetFloat();
KERNEL_CHECK_FALSE(((base_ > 0 && base_ != 1.0) || base_ == -1.0), KERNEL_STATUS_PARAM_INVALID,
"Attr base must be -1.0, or greater than 0 and not equal to 1, "
"but got attr base[%f].",
base_);
AttrValue *scale_ptr = ctx.GetAttr("scale");
KERNEL_CHECK_NULLPTR(scale_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr scale failed.");
AttrValue *shift_ptr = ctx.GetAttr("shift");
KERNEL_CHECK_NULLPTR(shift_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr shift failed.");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogCpuKernel::LogCompute(CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
AttrValue *base_ptr = ctx.GetAttr("base");
T base_;
base_ = static_cast<T>(base_ptr->GetFloat());
if (base_ == static_cast<T>(-1.0)) {
base_ = static_cast<T>(std::exp(1.0));
}
AttrValue *scale_ptr = ctx.GetAttr("scale");
T scale_;
scale_ = static_cast<T>(scale_ptr->GetFloat());
AttrValue *shift_ptr = ctx.GetAttr("shift");
T shift_;
shift_ = static_cast<T>(shift_ptr->GetFloat());
size_t data_num = ctx.Input(0)->NumElements();
if (data_num <= 4 * 1024) {
for (size_t i = 0; i < data_num; i++) {
if (*(input_x + i) <= static_cast<T>(0)) {
KERNEL_LOG_ERROR("Input[%llu] must be greater than 0.", i);
return KERNEL_STATUS_PARAM_INVALID;
}
*(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
}
} else {
uint32_t min_core_num = 1;
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_log = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (*(input_x + i) <= static_cast<T>(0)) {
KERNEL_LOG_ERROR("Input[%llu] must be greater than 0.", i);
return;
}
*(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log),
"Log Compute failed.");
}
return KERNEL_STATUS_OK;
}
uint32_t LogCpuKernel::LogCompute2(CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<Eigen::half *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<Eigen::half *>(ctx.Output(0)->GetData());
size_t data_num = ctx.Input(0)->NumElements();
for (uint64_t i = 0; i < data_num; i++) {
if (*(input_x + i) <= static_cast<Eigen::half>(0)) {
KERNEL_LOG_ERROR("Input[%llu] must be greater than 0.", i);
return KERNEL_STATUS_PARAM_INVALID;
}
}
AttrValue *base_ptr = ctx.GetAttr("base");
Eigen::half base_;
base_ = static_cast<Eigen::half>(base_ptr->GetFloat());
if (base_ == static_cast<Eigen::half>(-1.0)) {
base_ = static_cast<Eigen::half>(std::exp(1.0));
}
AttrValue *scale_ptr = ctx.GetAttr("scale");
Eigen::half scale_;
scale_ = static_cast<Eigen::half>(scale_ptr->GetFloat());
AttrValue *shift_ptr = ctx.GetAttr("shift");
Eigen::half shift_;
shift_ = static_cast<Eigen::half>(shift_ptr->GetFloat());
typedef Eigen::Array<Eigen::half, Eigen::Dynamic, Eigen::Dynamic> ArrayxXd;
ArrayxXd array_x(1, data_num);
ArrayxXd array_y(1, data_num);
ArrayxXd array_z(1, 1);
for (size_t i = 0; i < data_num; i++) {
array_x(0, i) = *(input_x + i);
}
array_x = array_x * scale_;
array_x = array_x + shift_;
array_y = array_x.log();
array_z(0, 0) = base_;
array_z = array_z.log();
if (data_num <= 8 * 1024) {
for (size_t i = 0; i < data_num; i++) {
*(output_y + i) = array_y(0, i) / array_z(0, 0);
}
} else {
uint32_t min_core_num = 1;
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_log = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
*(output_y + i) = array_y(0, i) / array_z(0, 0);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log),
"Log Compute failed.");
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogCpuKernel::LogCompute3(CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
size_t data_num = ctx.Input(0)->NumElements();
AttrValue *base_ptr = ctx.GetAttr("base");
T base_;
base_ = static_cast<T>(base_ptr->GetFloat());
if (base_ == static_cast<T>(-1.0)) {
base_ = static_cast<T>(std::exp(1.0));
}
AttrValue *scale_ptr = ctx.GetAttr("scale");
T scale_;
scale_ = static_cast<T>(scale_ptr->GetFloat());
AttrValue *shift_ptr = ctx.GetAttr("shift");
T shift_;
shift_ = static_cast<T>(shift_ptr->GetFloat());
if (data_num <= 4 * 1024) {
for (size_t i = 0; i < data_num; i++) {
if (*(input_x + i) == static_cast<T>(0)) {
KERNEL_LOG_ERROR("Input[%llu] must not be 0.", i);
return KERNEL_STATUS_PARAM_INVALID;
}
*(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
}
} else {
uint32_t min_core_num = 1;
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_log = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (*(input_x + i) == static_cast<T>(0)) {
KERNEL_LOG_ERROR("Input[%llu] must not be 0.", i);
return;
}
*(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log),
"Log Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLog, LogCpuKernel);
} // namespace aicpu
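For reference, the element-wise formula the Log kernel evaluates is log_base(scale * x + shift) = ln(scale * x + shift) / ln(base), where the attribute base == -1.0 selects the natural logarithm. A minimal standalone sketch of that mapping (hypothetical helper, not tied to the CpuKernel framework):

#include <cmath>

// Sketch of the per-element math in LogCompute: base == -1.0 stands for ln(x).
double LogWithBase(double x, double base, double scale, double shift) {
  double v = scale * x + shift;           // the kernel requires v to be positive
  if (base == -1.0) {
    return std::log(v);                   // natural logarithm
  }
  return std::log(v) / std::log(base);    // change-of-base identity
}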


@ -0,0 +1,42 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOG_H_
#define AICPU_KERNELS_NORMALIZED_LOG_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class LogCpuKernel : public CpuKernel {
public:
LogCpuKernel() = default;
~LogCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t LogCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t LogCompute(CpuKernelContext &ctx);
uint32_t LogCompute2(CpuKernelContext &ctx);
template <typename T>
uint32_t LogCompute3(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif


@ -0,0 +1,160 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "logspace.h"
#include <cmath>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kLogSpaceInputNum = 2;
constexpr uint32_t kLogSpaceOutputNum = 1;
const char *kLogSpace = "LogSpace";
#define LOGSPACE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = LogSpaceCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("LogSpace kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t LogSpaceCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kLogSpaceInputNum, kLogSpaceOutputNum), "[%s] check input and output failed.",
kLogSpace);
KERNEL_HANDLE_ERROR(LogSpaceCheck(ctx), "[%s] check params failed.", kLogSpace);
DataType data_type = ctx.Output(0)->GetDataType();
switch (data_type) {
LOGSPACE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
LOGSPACE_COMPUTE_CASE(DT_FLOAT, float, ctx)
default:
KERNEL_LOG_ERROR("LogSpace kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t LogSpaceCpuKernel::LogSpaceCheck(CpuKernelContext &ctx) {
// get Attr steps_attr
AttrValue *steps_attr_ptr = ctx.GetAttr("steps");
if (steps_attr_ptr) {
int64_t steps_data = steps_attr_ptr->GetInt();
KERNEL_CHECK_FALSE((steps_data >= 0), KERNEL_STATUS_PARAM_INVALID,
"Attr [steps] data has to be greater than or equal to 0.");
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogSpaceCpuKernel::LogSpaceCompute(CpuKernelContext &ctx) {
DataType data_type_in = ctx.Input(0)->GetDataType();
DataType data_type = ctx.Output(0)->GetDataType();
if (data_type_in == data_type) {
auto *input_start_ = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto *input_end_ = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto input_start = static_cast<double>(input_start_[0]);
auto input_end = static_cast<double>(input_end_[0]);
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
AttrValue *steps_data = ctx.GetAttr("steps");
AttrValue *base_data = ctx.GetAttr("base");
int64_t steps_value = 100;
int base_value = 10;
if (steps_data) {
steps_value = steps_data->GetInt();
}
if (base_data) {
base_value = base_data->GetInt();
}
if (steps_value != 1) {
double b = (input_end - input_start) / (steps_value - 1);
double q = pow(base_value, b);
double input_start_value = input_start;
for (int64_t i = 0; i < steps_value; i++) {
double end_num = pow(base_value, input_start_value) * pow(q, i);
*(output_y + i) = static_cast<T>(end_num);
}
}
if (steps_value == 1) {
double end_num = pow(base_value, double(input_start));
*(output_y) = static_cast<T>(end_num);
}
} else if (data_type_in == DT_FLOAT) {
auto *input_start_ = reinterpret_cast<float *>(ctx.Input(0)->GetData());
auto *input_end_ = reinterpret_cast<float *>(ctx.Input(1)->GetData());
auto input_start = static_cast<double>(input_start_[0]);
auto input_end = static_cast<double>(input_end_[0]);
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
AttrValue *steps_data = ctx.GetAttr("steps");
AttrValue *base_data = ctx.GetAttr("base");
int64_t steps_value = 100;
int base_value = 10;
if (steps_data) {
steps_value = steps_data->GetInt();
}
if (base_data) {
base_value = base_data->GetInt();
}
if (steps_value != 1) {
double b = (input_end - input_start) / (steps_value - 1);
double q = pow(base_value, b);
double input_start_value = input_start;
for (int64_t i = 0; i < steps_value; i++) {
double end_num = pow(base_value, input_start_value) * pow(q, i);
*(output_y + i) = static_cast<T>(end_num);
}
}
if (steps_value == 1) {
double end_num = pow(base_value, double(input_start));
*(output_y) = static_cast<T>(end_num);
}
} else if (data_type_in == DT_FLOAT16) {
auto *input_start_ = reinterpret_cast<Eigen::half *>(ctx.Input(0)->GetData());
auto *input_end_ = reinterpret_cast<Eigen::half *>(ctx.Input(1)->GetData());
auto input_start = static_cast<double>(input_start_[0]);
auto input_end = static_cast<double>(input_end_[0]);
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
AttrValue *steps_data = ctx.GetAttr("steps");
AttrValue *base_data = ctx.GetAttr("base");
int64_t steps_value = 100;
int base_value = 10;
if (steps_data) {
steps_value = steps_data->GetInt();
}
if (base_data) {
base_value = base_data->GetInt();
}
if (steps_value != 1) {
double b = (input_end - input_start) / (steps_value - 1);
double q = pow(base_value, b);
for (int64_t i = 0; i < steps_value; i++) {
double end_num = pow(base_value, input_start) * pow(q, i);
*(output_y + i) = static_cast<T>(end_num);
}
}
if (steps_value == 1) {
double end_num = pow(base_value, double(input_start));
*(output_y) = static_cast<T>(end_num);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLogSpace, LogSpaceCpuKernel);
} // namespace aicpu
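The LogSpace kernel above produces steps values of the form base^(start + i * b), where b = (end - start) / (steps - 1); the loops compute this as pow(base, start) * pow(q, i) with q = pow(base, b). A short standalone sketch of the same recurrence (illustrative only, hypothetical helper name):

#include <cmath>
#include <cstdint>
#include <vector>

// Sketch: logarithmically spaced values, matching the loop in LogSpaceCompute.
// steps == 1 degenerates to the single value base^start.
std::vector<double> LogSpaceValues(double start, double end, int64_t steps, double base) {
  std::vector<double> out;
  if (steps <= 0) {
    return out;
  }
  if (steps == 1) {
    out.push_back(std::pow(base, start));
    return out;
  }
  const double b = (end - start) / static_cast<double>(steps - 1);
  const double q = std::pow(base, b);
  double value = std::pow(base, start);
  for (int64_t i = 0; i < steps; ++i) {
    out.push_back(value);   // equals pow(base, start) * pow(q, i)
    value *= q;
  }
  return out;
}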


@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOGSPACE_H_
#define AICPU_KERNELS_NORMALIZED_LOGSPACE_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class LogSpaceCpuKernel : public CpuKernel {
public:
LogSpaceCpuKernel() = default;
~LogSpaceCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t LogSpaceCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t LogSpaceCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif


@ -0,0 +1,126 @@
/**
* Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "matrix_inverse.h"
#include <complex>
#include <vector>
#include "Eigen/Core"
#include "Eigen/LU"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kMatrixInverse = "MatrixInverse";
// if the data size is larger than the value, call ParallelFor() func
constexpr int64_t kParallelDataNums = 1 * 1024;
#define MATRIXINVERSE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MatrixInverseCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MatrixInverse kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MatrixInverseCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixInverse check input and output number failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MATRIXINVERSE_COMPUTE_CASE(DT_FLOAT, float, ctx)
MATRIXINVERSE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
MATRIXINVERSE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
MATRIXINVERSE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("MatrixInverse kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MatrixInverseCpuKernel::MatrixInverseCompute(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
T *input_ptr = reinterpret_cast<T *>(input->GetData());
Tensor *output = ctx.Output(0);
T *output_ptr = reinterpret_cast<T *>(output->GetData());
// Judge whether the input shape matches
auto shape = input->GetTensorShape();
uint64_t data_size = input->GetDataSize();
std::vector<int64_t> dims = shape->GetDimSizes();
KERNEL_CHECK_FALSE((dims.size() >= 2 && (*(dims.end() - 1) == *(dims.end() - 2))), KERNEL_STATUS_PARAM_INVALID,
"Input Shape is wrong");
auto last_dimsize = *(dims.end() - 1);
// Output length
auto input_num = input->NumElements();
size_t matrix_size = last_dimsize * last_dimsize;
// Number of matrices
size_t matrix_num = input_num / matrix_size;
// Store two-dimensional array of data for slicing
std::vector<std::vector<T>> temp(matrix_num, std::vector<T>(matrix_size));
for (size_t i = 0; i < matrix_num; i++) {
for (size_t j = 0; j < matrix_size; j++) {
temp[i][j] = *(input_ptr + i * matrix_size + j);
}
}
// Gets the value of the property adjoint
AttrValue *adjoint_attr = ctx.GetAttr("adjoint");
KERNEL_CHECK_NULLPTR(adjoint_attr, KERNEL_STATUS_PARAM_INVALID, "Get attr adjoint failed.");
bool adjoint__ = adjoint_attr->GetBool();
if (data_size <= kParallelDataNums) {
for (size_t i = 0; i < matrix_num; i++) {
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_input(temp[i].data(), last_dimsize,
last_dimsize);
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_output(output_ptr + i * matrix_size,
last_dimsize, last_dimsize);
if (adjoint__) {
eigen_input = eigen_input.adjoint().eval();
}
Eigen::FullPivLU<Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>> lu(eigen_input);
eigen_output = lu.inverse();
}
} else {
uint32_t min_core_num = 1;
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto sharedcompute = [&](size_t start, size_t end) {
for (auto i = start; i < end; i++) {
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_input(temp[i].data(), last_dimsize,
last_dimsize);
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_output(output_ptr + i * matrix_size,
last_dimsize, last_dimsize);
if (adjoint__) {
eigen_input = eigen_input.adjoint().eval();
}
Eigen::FullPivLU<Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>> lu(eigen_input);
eigen_output = lu.inverse();
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharedcompute),
"Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMatrixInverse, MatrixInverseCpuKernel);
} // namespace aicpu
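Each batch slice above is inverted independently through an Eigen FullPivLU factorization, optionally taking the adjoint first. A single-matrix sketch of that step with a dynamic double matrix (illustrative, not part of the commit):

#include <Eigen/Core>
#include <Eigen/LU>

// Sketch: the per-matrix work done inside MatrixInverseCompute for one batch slice.
Eigen::MatrixXd InvertOne(Eigen::MatrixXd m, bool adjoint) {
  if (adjoint) {
    m = m.adjoint().eval();  // conjugate-transpose first, as the kernel does
  }
  Eigen::FullPivLU<Eigen::MatrixXd> lu(m);
  return lu.inverse();       // LU-based inverse of the square input
}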


@ -0,0 +1,37 @@
/**
* Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MATRIXINVERSE_H_
#define AICPU_KERNELS_NORMALIZED_MATRIXINVERSE_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MatrixInverseCpuKernel : public CpuKernel {
public:
MatrixInverseCpuKernel() = default;
~MatrixInverseCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t MatrixInverseCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif


@ -0,0 +1,198 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "matrix_power.h"
#include <Eigen/Dense>
#include <algorithm>
#include <atomic>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 1;
const char *kMatrixPower = "MatrixPower";
const int64_t kParallelDataNum = 4 * 1024;
} // namespace
namespace aicpu {
uint32_t MatrixPowerCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixPower normal check failed.");
auto x_type = ctx.Input(0)->GetDataType();
if (x_type == DT_FLOAT) {
return ComputeKernel<float>(ctx);
} else {
return ComputeKernel<Eigen::half>(ctx);
}
}
template <typename T>
uint32_t MatrixPowerCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
Tensor *input_x = ctx.Input(0);
Tensor *output_y = ctx.Output(0);
AttrValue *power = ctx.GetAttr("n");
KERNEL_CHECK_NULLPTR(power, KERNEL_STATUS_PARAM_INVALID, "Get attr n failed.");
int64_t powervalue = power->GetInt();
auto x_shape = input_x->GetTensorShape();
size_t batch = x_shape->GetDimSize(0);
size_t dim = x_shape->GetDimSize(1);
auto x_ptr = reinterpret_cast<T *>(input_x->GetData());
auto y_ptr = reinterpret_cast<T *>(output_y->GetData());
int64_t data_num = ctx.Input(0)->NumElements() * sizeof(T);
if (powervalue < 0) {
powervalue = -powervalue;
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > batch) {
max_core_num = batch;
}
if (max_core_num == 0) {
max_core_num = 1;
}
std::atomic<int64_t> NotInvertible(-1);  // shared flag across ParallelFor shards
auto shard_matrix_power = [&](size_t start, size_t end) {
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
for (size_t i = start; i < end; i++) {
for (size_t p = 0; p < dim; p++) {
for (size_t q = 0; q < dim; q++) {
B(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
}
}
Eigen::FullPivLU<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> LU(B);
if (!(LU.isInvertible())) {
NotInvertible = i;
}
A = LU.inverse();
B.setIdentity();
int64_t n = powervalue;
while (n > 0) {
if (n % 2 == 1) {
B = B * A;
}
n = n / 2;
A = A * A;
}
for (size_t p = 0; p < dim; p++) {
for (size_t q = 0; q < dim; q++) {
y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
}
}
}
};
CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, shard_matrix_power);
KERNEL_CHECK_FALSE((NotInvertible < 0), KERNEL_STATUS_PARAM_INVALID,
"The [%ld]-th matrix of input tensor is singular, but got n is negative.", NotInvertible.load())
} else {
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
for (size_t i = 0; i < batch; i++) {
for (size_t p = 0; p < dim; p++) {
for (size_t q = 0; q < dim; q++) {
B(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
}
}
Eigen::FullPivLU<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> LU(B);
KERNEL_CHECK_FALSE((LU.isInvertible()), KERNEL_STATUS_PARAM_INVALID,
"The [%zu]-th matrix of input tensor is singular, but got n is negative.", i)
A = LU.inverse();
B.setIdentity();
int64_t n = powervalue;
while (n > 0) {
if (n % 2 == 1) {
B = B * A;
}
n = n / 2;
A = A * A;
}
for (size_t p = 0; p < dim; p++) {
for (size_t q = 0; q < dim; q++) {
y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
}
}
}
}
} else {
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > batch) {
max_core_num = batch;
}
if (max_core_num == 0) {
max_core_num = 1;
}
auto shard_matrix_power = [&](size_t start, size_t end) {
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
for (size_t i = start; i < end; i++) {
for (size_t p = 0; p < dim; p++) {
for (size_t q = 0; q < dim; q++) {
A(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
}
}
B.setIdentity();
int64_t n = powervalue;
while (n > 0) {
if (n % 2 == 1) {
B = B * A;
}
n = n / 2;
A = A * A;
}
for (size_t p = 0; p < dim; p++) {
for (size_t q = 0; q < dim; q++) {
y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
}
}
}
};
CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, shard_matrix_power);
} else {
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
for (size_t i = 0; i < batch; i++) {
for (size_t p = 0; p < dim; p++) {
for (size_t q = 0; q < dim; q++) {
A(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
}
}
B.setIdentity();
int64_t n = powervalue;
while (n > 0) {
if (n % 2 == 1) {
B = B * A;
}
n = n / 2;
A = A * A;
}
for (size_t p = 0; p < dim; p++) {
for (size_t q = 0; q < dim; q++) {
y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
}
}
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMatrixPower, MatrixPowerCpuKernel);
} // namespace aicpu
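The while-loops above are exponentiation by squaring: B accumulates a factor whenever the current bit of n is set and A is squared once per bit, so a power n costs O(log n) matrix multiplications; for negative n the kernel first replaces A by the inverse obtained from the FullPivLU factorization. A scalar sketch of the same loop structure (illustrative only):

#include <cstdint>

// Sketch: binary exponentiation, mirroring the loop in ComputeKernel (result plays
// the role of B.setIdentity(), a the role of the matrix A).
double PowBySquaring(double a, int64_t n) {
  double result = 1.0;
  while (n > 0) {
    if (n % 2 == 1) {
      result *= a;  // multiply in the current power of a when the bit is set
    }
    n /= 2;
    a *= a;         // square the base once per bit
  }
  return result;
}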


@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_POWER_H_
#define AICPU_KERNELS_NORMALIZED_MATRIX_POWER_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class MatrixPowerCpuKernel : public CpuKernel {
public:
MatrixPowerCpuKernel() = default;
~MatrixPowerCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t ComputeKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif


@ -0,0 +1,160 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "matrix_solve.h"
#include <complex>
#include "Eigen/Core"
#include "Eigen/LU"
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kMatrixSolve = "MatrixSolve";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const int64_t kParallelDataNumSameShape = 8 * 1024;
const int64_t kParallelDataNumSameShapeMid = 128 * 1024;
} // namespace
namespace aicpu {
uint32_t MatrixSolveCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixSolve check input and output number failed.");
KERNEL_HANDLE_ERROR(MatrixSolveDataAndTypeCheck(ctx), "MatrixSolve check input and output params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_FLOAT:
return MatrixSolveCompute<float>(ctx);
case DT_DOUBLE:
return MatrixSolveCompute<double>(ctx);
case DT_COMPLEX64:
return MatrixSolveCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return MatrixSolveCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("MatrixSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MatrixSolveCpuKernel::MatrixSolveDataAndTypeCheck(CpuKernelContext &ctx) {
DataType matrix_type = ctx.Input(0)->GetDataType();
DataType rhs_type = ctx.Input(1)->GetDataType();
KERNEL_CHECK_FALSE((matrix_type == rhs_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(matrix_type).c_str(), DTypeStr(rhs_type).c_str())
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MatrixSolveCpuKernel::MatrixSolveCompute(CpuKernelContext &ctx) {
auto input0_tensor = ctx.Input(0);
auto input0_tensor_shape = input0_tensor->GetTensorShape();
auto input1_tensor = ctx.Input(1);
auto input1_tensor_shape = input1_tensor->GetTensorShape();
auto input0_data = reinterpret_cast<T *>(input0_tensor->GetData());
auto input1_data = reinterpret_cast<T *>(input1_tensor->GetData());
auto input0_shape = input0_tensor_shape->GetDimSizes();
int32_t input0_dims = input0_tensor_shape->GetDims();
int32_t input1_dims = input1_tensor_shape->GetDims();
int64_t m = input0_shape[input0_dims - 1];
int64_t size_mm = m * m;
KERNEL_CHECK_FALSE((input0_shape[input0_dims - 1] == input0_shape[input0_dims - 2]), KERNEL_STATUS_PARAM_INVALID,
"Input[matrix] must be a square matrix")
KERNEL_CHECK_FALSE((input1_dims >= 2), KERNEL_STATUS_PARAM_INVALID, "Input[rhs] must be a matrix")
KERNEL_CHECK_FALSE(
(input0_tensor_shape->GetDimSize(input0_dims - 1) == input1_tensor_shape->GetDimSize(input1_dims - 2)),
KERNEL_STATUS_PARAM_INVALID, "Input matrix and rhs are incompatible")
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
AttrValue *adjoint_ptr = ctx.GetAttr("adjoint");
KERNEL_CHECK_NULLPTR(adjoint_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr adjoint failed.");
auto adjoint = adjoint_ptr->GetBool();
auto input1_shape = input1_tensor_shape->GetDimSizes();
int64_t k = input1_shape[input1_dims - 1];
auto output_tensor = ctx.Output(0);
auto output_data = reinterpret_cast<T *>(output_tensor->GetData());
if (size_mm > 0) {
size_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
if (data_size >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_size <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
// if the number of AI CPU cores exceeds the number of matrices, cap max_core_num at the matrix count
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto sharder_matrix_solve = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Eigen::Map<MatrixXd> input0(input0_data + i * m * m, m, m);
Eigen::Map<MatrixXd> input1(input1_data + i * m * k, m, k);
Eigen::Map<MatrixXd> output(output_data + i * m * k, m, k);
if (input0.rows() == 0 || input0.cols() == 0 || input1.cols() == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
Eigen::PartialPivLU<MatrixXd> lu_decomposition(input0.rows());
if (adjoint) {
lu_decomposition.compute(input0.adjoint());
} else {
lu_decomposition.compute(input0);
}
using RealScalar = typename Eigen::NumTraits<T>::Real;
RealScalar pivot = lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff();
KERNEL_CHECK_FALSE((pivot > RealScalar(0)), KERNEL_STATUS_PARAM_INVALID, "Input matrix is not invertible");
output.noalias() = lu_decomposition.solve(input1);
}
return KERNEL_STATUS_OK;
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharder_matrix_solve),
"Matrix Solve Compute failed");
} else {
for (size_t i = 0; i < matrix_num; i++) {
Eigen::Map<MatrixXd> input0(input0_data + i * m * m, m, m);
Eigen::Map<MatrixXd> input1(input1_data + i * m * k, m, k);
Eigen::Map<MatrixXd> output(output_data + i * m * k, m, k);
if (input0.rows() == 0 || input0.cols() == 0 || input1.cols() == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
Eigen::PartialPivLU<MatrixXd> lu_decomposition(input0.rows());
if (adjoint) {
lu_decomposition.compute(input0.adjoint());
} else {
lu_decomposition.compute(input0);
}
using RealScalar = typename Eigen::NumTraits<T>::Real;
RealScalar pivot = lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff();
KERNEL_CHECK_FALSE((pivot > RealScalar(0)), KERNEL_STATUS_PARAM_INVALID, "Input matrix is not invertible");
output.noalias() = lu_decomposition.solve(input1);
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMatrixSolve, MatrixSolveCpuKernel);
} // namespace aicpu
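Each batch slice above solves matrix * x = rhs through a PartialPivLU factorization and rejects the system when the smallest absolute pivot on the LU diagonal is zero. A single-system sketch of that step with dynamic double matrices (illustrative, hypothetical helper name):

#include <Eigen/Core>
#include <Eigen/LU>

// Sketch: the per-slice solve in MatrixSolveCompute, without batching or the adjoint path.
bool SolveOne(const Eigen::MatrixXd &a, const Eigen::MatrixXd &rhs, Eigen::MatrixXd &x) {
  Eigen::PartialPivLU<Eigen::MatrixXd> lu(a);
  // Same singularity test as the kernel: a zero pivot means the matrix is not invertible.
  double pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
  if (!(pivot > 0.0)) {
    return false;
  }
  x = lu.solve(rhs);
  return true;
}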


@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MATRIXSOLVE_H_
#define AICPU_KERNELS_NORMALIZED_MATRIXSOLVE_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class MatrixSolveCpuKernel : public CpuKernel {
public:
MatrixSolveCpuKernel() = default;
~MatrixSolveCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MatrixSolveDataAndTypeCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t MatrixSolveCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif


@ -0,0 +1,315 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "max_pool_3d_grad_with_argmax.h"
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const char *kMaxPool3DGradWithArgmax = "MaxPool3DGradWithArgmax";
#define MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DTYPE, INTYPE, ARGTYPE, CTX) \
case (DTYPE): { \
uint32_t result = MaxPool3DGradWithArgmaxCompute<INTYPE, ARGTYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MaxPool3DGradWithArgmax kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MaxPool3DGradWithArgmaxCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
std::vector<std::string> attr_names = {"ksize", "strides", "pads", "dilation"};
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum, attr_names),
"MaxPool3DGradWithArgmax check input and output number failed.");
KERNEL_HANDLE_ERROR(MaxPool3DGradWithArgmaxParamCheck(ctx), "MaxPool3DGradWithArgmax check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto argmax_type = ctx.Input(2)->GetDataType();
if (argmax_type == DT_INT32) {
switch (data_type) {
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int32_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int32_t, ctx)
default:
KERNEL_LOG_ERROR("MaxPool3DGradWithArgmax kernel input data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
} else if (argmax_type == DT_INT64) {
switch (data_type) {
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int64_t, ctx)
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int64_t, ctx)
default:
KERNEL_LOG_ERROR("MaxPool3DGradWithArgmax kernel input data type [%s] not support.",
DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
} else {
KERNEL_LOG_ERROR(
"MaxPool3DGradWithArgmax kernel input_argmax data type [%s] not "
"support.",
DTypeStr(argmax_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxParamCheck(CpuKernelContext &ctx) {
auto input_x_info = ctx.Input(0);
auto input_grads_info = ctx.Input(1);
auto input_argmax_info = ctx.Input(2);
auto output_y_info = ctx.Output(0);
DataType input_x_type = input_x_info->GetDataType();
DataType input_grads_type = input_grads_info->GetDataType();
DataType out_type = output_y_info->GetDataType();
KERNEL_CHECK_FALSE((input_x_type == input_grads_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input x [%s] need be same with "
"input grads [%s].",
DTypeStr(input_x_type).c_str(), DTypeStr(input_grads_type).c_str())
KERNEL_CHECK_FALSE((input_x_type == out_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input x [%s] need be same with "
"output [%s].",
DTypeStr(input_x_type).c_str(), DTypeStr(out_type).c_str())
DataType input_argmax_type = input_argmax_info->GetDataType();
KERNEL_CHECK_FALSE((input_argmax_type == DT_INT32) || (input_argmax_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"The data type of output argmax:[%s] should be a int32 or int64. ",
DTypeStr(input_argmax_type).c_str())
std::vector<int64_t> dim_vec = input_x_info->GetTensorShape()->GetDimSizes();
int64_t dimsize = dim_vec.size();
KERNEL_CHECK_FALSE(dimsize == 5, KERNEL_STATUS_PARAM_INVALID, "The dim of input:[%d] should be 5.", dimsize)
const size_t DIM_SIZE1 = 1;
const size_t DIM_SIZE3 = 3;
const size_t DIM_SIZE5 = 5;
AttrValue *attr_ksize = ctx.GetAttr("ksize");
std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
KERNEL_CHECK_FALSE(ksizeList.size() == DIM_SIZE1 || ksizeList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
"The size of ksize:[%d] should be 1 or 3.", ksizeList.size())
AttrValue *attr_strides = ctx.GetAttr("strides");
std::vector<int64_t> stridesList = attr_strides->GetListInt();
KERNEL_CHECK_FALSE(stridesList.size() == DIM_SIZE1 || stridesList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
"The size of strides:[%d] should be 1 or 3.", stridesList.size())
AttrValue *attr_pads = ctx.GetAttr("pads");
std::vector<int64_t> padsList = attr_pads->GetListInt();
KERNEL_CHECK_FALSE(padsList.size() == DIM_SIZE1 || padsList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
"The size of pads:[%d] should be 1 or 3.", padsList.size())
AttrValue *attr_dilation = ctx.GetAttr("dilation");
std::vector<int64_t> dilationList = attr_dilation->GetListInt();
KERNEL_CHECK_FALSE(
dilationList.size() == DIM_SIZE1 || dilationList.size() == DIM_SIZE3 || dilationList.size() == DIM_SIZE5,
KERNEL_STATUS_PARAM_INVALID, "The size of dilation:[%d] should be 1, 3 or 5.", dilationList.size())
KERNEL_LOG_DEBUG(
"MaxPool3DGradWithArgmaxCpuKernel[%s], input x: size[%llu];"
"input grads: size[%llu], input argmax: size[%llu], output y: "
"size[%lld].",
ctx.GetOpType().c_str(), input_x_info->GetDataSize(), input_grads_info->GetDataSize(),
input_argmax_info->GetDataSize(), output_y_info->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T, typename S>
void MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxSingleCompute(
T *input_grad, S *input_argmax, T *output_y, int64_t iD, int64_t iH, int64_t iW, int64_t oD, int64_t oH, int64_t oW,
int64_t kD, int64_t kH, int64_t kW, int64_t sD, int64_t sH, int64_t sW, int64_t pD, int64_t pH, int64_t pW,
int64_t dD, int64_t dH, int64_t dW) {
T *in_grad = input_grad;
T *out_y = output_y;
S *argmax = input_argmax;
/* calculate max points */
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t ti, i, j;
for (ti = 0; ti < oD; ti++) {
for (i = 0; i < oH; i++) {
for (j = 0; j < oW; j++) {
/* retrieve position of max */
int64_t index = ti * oH * oW + i * oW + j;
int64_t maxp = argmax[index];
if (maxp != -1) {
/* update gradient */
out_y[maxp] += in_grad[index];
}
}
}
}
}
template <typename T, typename S>
uint32_t MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxCompute(CpuKernelContext &ctx) {
auto input_x_info = ctx.Input(0);
auto input_grads_info = ctx.Input(1);
auto input_argmax_info = ctx.Input(2);
auto output_y_info = ctx.Output(0);
auto input_grads = reinterpret_cast<T *>(input_grads_info->GetData());
auto input_argmax = reinterpret_cast<S *>(input_argmax_info->GetData());
auto output_y = reinterpret_cast<T *>(output_y_info->GetData());
AttrValue *attr_ksize = ctx.GetAttr("ksize");
std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
AttrValue *attr_strides = ctx.GetAttr("strides");
std::vector<int64_t> stridesList = attr_strides->GetListInt();
AttrValue *attr_pads = ctx.GetAttr("pads");
std::vector<int64_t> padsList = attr_pads->GetListInt();
AttrValue *attr_dilation = ctx.GetAttr("dilation");
std::vector<int64_t> initList = {1, 1, 1, 1, 1};
std::vector<int64_t> dilationList = (attr_dilation == nullptr) ? initList : attr_dilation->GetListInt();
auto input_shape_vec = input_x_info->GetTensorShape()->GetDimSizes();
auto output_shape_vec = input_grads_info->GetTensorShape()->GetDimSizes();
const int64_t in_width = input_shape_vec[4];
const int64_t in_height = input_shape_vec[3];
const int64_t in_depth = input_shape_vec[2];
const int64_t in_channel = input_shape_vec[1];
const int64_t in_batch = input_shape_vec[0];
const int64_t out_width = output_shape_vec[4];
const int64_t out_height = output_shape_vec[3];
const int64_t out_depth = output_shape_vec[2];
const size_t DIM_SIZE1 = 1;
const size_t DIM_SIZE5 = 5;
std::vector<int64_t> ksizeTempList;
if (ksizeList.size() == DIM_SIZE1) {
ksizeTempList.push_back(ksizeList[0]);
ksizeTempList.push_back(ksizeList[0]);
ksizeTempList.push_back(ksizeList[0]);
} else {
ksizeTempList.push_back(ksizeList[0]);
ksizeTempList.push_back(ksizeList[1]);
ksizeTempList.push_back(ksizeList[2]);
}
std::vector<int64_t> stridesTempList;
if (stridesList.size() == DIM_SIZE1) {
stridesTempList.push_back(stridesList[0]);
stridesTempList.push_back(stridesList[0]);
stridesTempList.push_back(stridesList[0]);
} else {
stridesTempList.push_back(stridesList[0]);
stridesTempList.push_back(stridesList[1]);
stridesTempList.push_back(stridesList[2]);
}
std::vector<int64_t> padsTempList;
if (padsList.size() == DIM_SIZE1) {
padsTempList.push_back(padsList[0]);
padsTempList.push_back(padsList[0]);
padsTempList.push_back(padsList[0]);
} else {
padsTempList.push_back(padsList[0]);
padsTempList.push_back(padsList[1]);
padsTempList.push_back(padsList[2]);
}
std::vector<int64_t> dilationTempList;
if (dilationList.size() == DIM_SIZE1) {
dilationTempList.push_back(dilationList[0]);
dilationTempList.push_back(dilationList[0]);
dilationTempList.push_back(dilationList[0]);
} else if (dilationList.size() == DIM_SIZE5) {
dilationTempList.push_back(dilationList[2]);
dilationTempList.push_back(dilationList[3]);
dilationTempList.push_back(dilationList[4]);
  } else {
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[1]);
    dilationTempList.push_back(dilationList[2]);
}
const int64_t k_width = ksizeTempList[2];
const int64_t k_height = ksizeTempList[1];
const int64_t k_depth = ksizeTempList[0];
const int64_t s_width = stridesTempList[2];
const int64_t s_height = stridesTempList[1];
const int64_t s_depth = stridesTempList[0];
const int64_t p_width = padsTempList[2];
const int64_t p_height = padsTempList[1];
const int64_t p_depth = padsTempList[0];
const int64_t d_width = dilationTempList[2];
const int64_t d_height = dilationTempList[1];
const int64_t d_depth = dilationTempList[0];
KERNEL_CHECK_FALSE(k_width / 2 >= p_width && k_height / 2 >= p_height && k_depth / 2 >= p_depth,
KERNEL_STATUS_PARAM_INVALID, "pads should be smaller than or equal to half of kernel size.");
int64_t data_num = ctx.Input(0)->NumElements();
const int64_t batch = in_batch * in_channel;
const int64_t in_stride = in_width * in_height * in_depth;
const int64_t out_stride = out_width * out_height * out_depth;
const int64_t kParallelDataNum = 16 * in_width * in_height * in_depth;
const int64_t kParallelDataNumMid = 72 * in_width * in_height * in_depth;
const float ZERO = 0.f;
int64_t output_num = ctx.Output(0)->NumElements();
for (int64_t i = 0; i < output_num; i++) {
output_y[i] = static_cast<T>(ZERO);
}
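  // Each (batch, channel) plane is independent; parallelize over planes once the
  // input is large enough, limiting the core count for mid-sized inputs.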
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_max_pool3d_grad_with_argmax = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
MaxPool3DGradWithArgmaxSingleCompute(input_grads + i * out_stride, input_argmax + i * out_stride,
output_y + i * in_stride, in_depth, in_height, in_width, out_depth,
out_height, out_width, k_depth, k_height, k_width, s_depth, s_height,
s_width, p_depth, p_height, p_width, d_depth, d_height, d_width);
}
};
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_max_pool3d_grad_with_argmax),
"MaxPool3DGradWithArgmax Compute failed.");
} else {
for (int64_t i = 0; i < batch; i++) {
MaxPool3DGradWithArgmaxSingleCompute(input_grads + i * out_stride, input_argmax + i * out_stride,
output_y + i * in_stride, in_depth, in_height, in_width, out_depth,
out_height, out_width, k_depth, k_height, k_width, s_depth, s_height,
s_width, p_depth, p_height, p_width, d_depth, d_height, d_width);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaxPool3DGradWithArgmax, MaxPool3DGradWithArgmaxCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,45 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAX_POOL3D_GRAD_WITH_ARGMAX_H_
#define AICPU_KERNELS_NORMALIZED_MAX_POOL3D_GRAD_WITH_ARGMAX_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MaxPool3DGradWithArgmaxCpuKernel : public CpuKernel {
public:
MaxPool3DGradWithArgmaxCpuKernel() = default;
~MaxPool3DGradWithArgmaxCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MaxPool3DGradWithArgmaxParamCheck(CpuKernelContext &ctx);
template <typename T, typename S>
uint32_t MaxPool3DGradWithArgmaxCompute(CpuKernelContext &ctx);
template <typename T, typename S>
void MaxPool3DGradWithArgmaxSingleCompute(T *input_x, S *input_argmax, T *output_y, int64_t iD, int64_t iH,
int64_t iW, int64_t oD, int64_t oH, int64_t oW, int64_t kD, int64_t kH,
int64_t kW, int64_t sD, int64_t sH, int64_t sW, int64_t pD, int64_t pH,
int64_t pW, int64_t dD, int64_t dH, int64_t dW);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_POOL3D_GRAD_WITH_ARGMAX_H_

View File

@ -0,0 +1,342 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "max_pool_3d_with_argmax.h"
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 1;
const char *kMaxPool3DWithArgmax = "MaxPool3DWithArgmax";
#define MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DTYPE, INTYPE, OUTTYPE, CTX) \
case (DTYPE): { \
uint32_t result = MaxPool3DWithArgmaxCompute<INTYPE, OUTTYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MaxPool3DWithArgmaxCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
std::vector<std::string> attr_names = {"ksize", "strides", "pads"};
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum, attr_names),
"MaxPool3DWithArgmax check input and output number failed.");
KERNEL_HANDLE_ERROR(MaxPool3DWithArgmaxParamCheck(ctx), "MaxPool3DWithArgmax check params failed.");
auto in_data_type = ctx.Input(0)->GetDataType();
auto out_data_type = ctx.Output(1)->GetDataType();
std::string argmax_type =
(ctx.GetAttr("argmax_type") == nullptr) ? "bitmask" : ctx.GetAttr("argmax_type")->GetString();
if (argmax_type == "bitmask") {
KERNEL_LOG_ERROR("Bitmask is not supported now.");
return KERNEL_STATUS_PARAM_INVALID;
} else {
if (out_data_type == DT_INT32) {
switch (in_data_type) {
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int32_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int32_t, ctx)
default:
KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel input data type [%s] not support.",
DTypeStr(in_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
} else if (out_data_type == DT_INT64) {
switch (in_data_type) {
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int64_t, ctx)
MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int64_t, ctx)
default:
KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel input data type [%s] not support.",
DTypeStr(in_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
} else {
KERNEL_LOG_ERROR(
"MaxPool3DWithArgmax kernel output_argmax data type [%s] not "
"support.",
DTypeStr(out_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
}
uint32_t MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxParamCheck(CpuKernelContext &ctx) {
auto input_info = ctx.Input(0);
auto output_y_info = ctx.Output(0);
auto output_argmax_info = ctx.Output(1);
DataType input_type = input_info->GetDataType();
DataType output_y_type = output_y_info->GetDataType();
KERNEL_CHECK_FALSE((input_type == output_y_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input x [%s] need be same with "
"output y [%s].",
DTypeStr(input_type).c_str(), DTypeStr(output_y_type).c_str())
DataType output_argmax_type = output_argmax_info->GetDataType();
KERNEL_CHECK_FALSE((output_argmax_type == DT_INT32) || (output_argmax_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"The data type of output argmax:[%s] should be a int32 or int64. ",
DTypeStr(output_argmax_type).c_str())
std::vector<int64_t> dim_vec = input_info->GetTensorShape()->GetDimSizes();
int64_t dimsize = dim_vec.size();
KERNEL_CHECK_FALSE(dimsize == 5, KERNEL_STATUS_PARAM_INVALID, "The dim of input:[%d] should be 5.", dimsize)
const size_t DIM_SIZE1 = 1;
const size_t DIM_SIZE3 = 3;
const size_t DIM_SIZE5 = 5;
AttrValue *attr_ksize = ctx.GetAttr("ksize");
std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
KERNEL_CHECK_FALSE(ksizeList.size() == DIM_SIZE1 || ksizeList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
"The size of ksize:[%d] should be 1 or 3.", ksizeList.size())
AttrValue *attr_strides = ctx.GetAttr("strides");
std::vector<int64_t> stridesList = attr_strides->GetListInt();
KERNEL_CHECK_FALSE(stridesList.size() == DIM_SIZE1 || stridesList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
"The size of strides:[%d] should be 1 or 3.", stridesList.size())
AttrValue *attr_pads = ctx.GetAttr("pads");
std::vector<int64_t> padsList = attr_pads->GetListInt();
KERNEL_CHECK_FALSE(padsList.size() == DIM_SIZE1 || padsList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
"The size of pads:[%d] should be 1 or 3.", padsList.size())
AttrValue *attr_dilation = ctx.GetAttr("dilation");
std::vector<int64_t> initList = {1, 1, 1, 1, 1};
std::vector<int64_t> dilationList = (attr_dilation == nullptr) ? initList : attr_dilation->GetListInt();
KERNEL_CHECK_FALSE(
dilationList.size() == DIM_SIZE1 || dilationList.size() == DIM_SIZE3 || dilationList.size() == DIM_SIZE5,
KERNEL_STATUS_PARAM_INVALID, "The size of dilation:[%d] should be 1, 3 or 5.", dilationList.size())
KERNEL_LOG_DEBUG(
"MaxPool3sWithArgmaxCpuKernel[%s], input x: size[%llu];"
"output y: size[%llu], output argmax: size[%llu].",
ctx.GetOpType().c_str(), input_info->GetDataSize(), output_y_info->GetDataSize(),
output_argmax_info->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T, typename S>
void MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxSingleCompute(T *input, T *output_y, S *output_argmax, int64_t iD,
int64_t iH, int64_t iW, int64_t oD, int64_t oH,
int64_t oW, int64_t kD, int64_t kH, int64_t kW,
int64_t sD, int64_t sH, int64_t sW, int64_t pD,
int64_t pH, int64_t pW, int64_t dD, int64_t dH,
int64_t dW) {
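  // For every output voxel, scan its (possibly dilated) pooling window in the input,
  // recording the maximum value and the flattened input index where it occurs.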
int64_t i, j, ti;
T *ip = input;
for (ti = 0; ti < oD; ti++) {
for (i = 0; i < oH; i++) {
for (j = 0; j < oW; j++) {
int64_t start_t = ti * sD - pD;
int64_t start_h = i * sH - pH;
int64_t start_w = j * sW - pW;
int64_t end_t = std::min(start_t + (kD - 1) * dD + 1, iD);
int64_t end_h = std::min(start_h + (kH - 1) * dH + 1, iH);
int64_t end_w = std::min(start_w + (kW - 1) * dW + 1, iW);
while (start_t < 0) {
start_t += dD;
}
while (start_h < 0) {
start_h += dH;
}
while (start_w < 0) {
start_w += dW;
}
T *op = output_y + ti * oW * oH + i * oW + j;
S *indzp = output_argmax + ti * oW * oH + i * oW + j;
S maxindex = start_t * iH * iW + start_h * iW + start_w;
        T maxval = std::numeric_limits<T>::lowest();  // lowest() is valid for both integer and floating point T
for (int64_t z = start_t; z < end_t; z += dD) {
for (int64_t y = start_h; y < end_h; y += dH) {
for (int64_t x = start_w; x < end_w; x += dW) {
S index = z * iH * iW + y * iW + x;
T val = ip[index];
if ((val > maxval) || std::isnan(double(val))) {
maxval = (T)val;
maxindex = index;
}
}
}
}
// store location of max
*indzp = maxindex;
/* set output to local max */
*op = maxval;
}
}
}
}
template <typename T, typename S>
uint32_t MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxCompute(CpuKernelContext &ctx) {
auto input_info = ctx.Input(0);
auto output_y_info = ctx.Output(0);
auto output_argmax_info = ctx.Output(1);
auto input_x = reinterpret_cast<T *>(input_info->GetData());
auto output_y = reinterpret_cast<T *>(output_y_info->GetData());
auto output_argmax = reinterpret_cast<S *>(output_argmax_info->GetData());
AttrValue *attr_ksize = ctx.GetAttr("ksize");
std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
AttrValue *attr_strides = ctx.GetAttr("strides");
std::vector<int64_t> stridesList = attr_strides->GetListInt();
AttrValue *attr_pads = ctx.GetAttr("pads");
std::vector<int64_t> padsList = attr_pads->GetListInt();
AttrValue *attr_dilation = ctx.GetAttr("dilation");
std::vector<int64_t> initList = {1, 1, 1, 1, 1};
std::vector<int64_t> dilationList = (attr_dilation == nullptr) ? initList : attr_dilation->GetListInt();
auto input_shape_vec = input_info->GetTensorShape()->GetDimSizes();
auto output_shape_vec = output_y_info->GetTensorShape()->GetDimSizes();
const int64_t in_width = input_shape_vec[4];
const int64_t in_height = input_shape_vec[3];
const int64_t in_depth = input_shape_vec[2];
const int64_t in_channel = input_shape_vec[1];
const int64_t in_batch = input_shape_vec[0];
const int64_t out_width = output_shape_vec[4];
const int64_t out_height = output_shape_vec[3];
const int64_t out_depth = output_shape_vec[2];
const size_t DIM_SIZE1 = 1;
const size_t DIM_SIZE5 = 5;
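  // Expand ksize/strides/pads (1 or 3 values) and dilation (1, 3 or 5 values) to
  // fixed [depth, height, width] triples.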
std::vector<int64_t> ksizeTempList;
if (ksizeList.size() == DIM_SIZE1) {
ksizeTempList.push_back(ksizeList[0]);
ksizeTempList.push_back(ksizeList[0]);
ksizeTempList.push_back(ksizeList[0]);
} else {
ksizeTempList.push_back(ksizeList[0]);
ksizeTempList.push_back(ksizeList[1]);
ksizeTempList.push_back(ksizeList[2]);
}
std::vector<int64_t> stridesTempList;
if (stridesList.size() == DIM_SIZE1) {
stridesTempList.push_back(stridesList[0]);
stridesTempList.push_back(stridesList[0]);
stridesTempList.push_back(stridesList[0]);
} else {
stridesTempList.push_back(stridesList[0]);
stridesTempList.push_back(stridesList[1]);
stridesTempList.push_back(stridesList[2]);
}
std::vector<int64_t> padsTempList;
if (padsList.size() == DIM_SIZE1) {
padsTempList.push_back(padsList[0]);
padsTempList.push_back(padsList[0]);
padsTempList.push_back(padsList[0]);
} else {
padsTempList.push_back(padsList[0]);
padsTempList.push_back(padsList[1]);
padsTempList.push_back(padsList[2]);
}
std::vector<int64_t> dilationTempList;
if (dilationList.size() == DIM_SIZE1) {
dilationTempList.push_back(dilationList[0]);
dilationTempList.push_back(dilationList[0]);
dilationTempList.push_back(dilationList[0]);
} else if (dilationList.size() == DIM_SIZE5) {
dilationTempList.push_back(dilationList[2]);
dilationTempList.push_back(dilationList[3]);
dilationTempList.push_back(dilationList[4]);
} else {
dilationTempList.push_back(dilationList[0]);
dilationTempList.push_back(dilationList[1]);
dilationTempList.push_back(dilationList[2]);
}
const int64_t k_width = ksizeTempList[2];
const int64_t k_height = ksizeTempList[1];
const int64_t k_depth = ksizeTempList[0];
const int64_t s_width = stridesTempList[2];
const int64_t s_height = stridesTempList[1];
const int64_t s_depth = stridesTempList[0];
const int64_t p_width = padsTempList[2];
const int64_t p_height = padsTempList[1];
const int64_t p_depth = padsTempList[0];
const int64_t d_width = dilationTempList[2];
const int64_t d_height = dilationTempList[1];
const int64_t d_depth = dilationTempList[0];
KERNEL_CHECK_FALSE(k_width / 2 >= p_width && k_height / 2 >= p_height && k_depth / 2 >= p_depth,
KERNEL_STATUS_PARAM_INVALID, "pads should be smaller than or equal to half of kernel size.");
int64_t data_num = ctx.Input(0)->NumElements();
const int64_t batch = in_batch * in_channel;
const int64_t in_stride = in_width * in_height * in_depth;
const int64_t out_stride = out_width * out_height * out_depth;
const int64_t kParallelDataNum = 16 * in_width * in_height * in_depth;
const int64_t kParallelDataNumMid = 72 * in_width * in_height * in_depth;
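  // Each (batch, channel) plane is pooled independently; parallelize over planes
  // when the input exceeds the threshold, limiting the core count for mid-sized inputs.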
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_max_pool3d_with_argmax = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
MaxPool3DWithArgmaxSingleCompute(input_x + i * in_stride, output_y + i * out_stride,
output_argmax + i * out_stride, in_depth, in_height, in_width, out_depth,
out_height, out_width, k_depth, k_height, k_width, s_depth, s_height, s_width,
p_depth, p_height, p_width, d_depth, d_height, d_width);
}
};
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_max_pool3d_with_argmax),
"MaxPool3DWithArgmax Compute failed.");
} else {
for (int64_t i = 0; i < batch; i++) {
MaxPool3DWithArgmaxSingleCompute(input_x + i * in_stride, output_y + i * out_stride,
output_argmax + i * out_stride, in_depth, in_height, in_width, out_depth,
out_height, out_width, k_depth, k_height, k_width, s_depth, s_height, s_width,
p_depth, p_height, p_width, d_depth, d_height, d_width);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaxPool3DWithArgmax, MaxPool3DWithArgmaxCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,46 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAX_POOL3D_WITH_ARGMAX_H_
#define AICPU_KERNELS_NORMALIZED_MAX_POOL3D_WITH_ARGMAX_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MaxPool3DWithArgmaxCpuKernel : public CpuKernel {
public:
MaxPool3DWithArgmaxCpuKernel() = default;
~MaxPool3DWithArgmaxCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MaxPool3DWithArgmaxParamCheck(CpuKernelContext &ctx);
template <typename T, typename S>
uint32_t MaxPool3DWithArgmaxCompute(CpuKernelContext &ctx);
template <typename T, typename S>
void MaxPool3DWithArgmaxSingleCompute(T *input, T *output_y, S *output_argmax, int64_t iD, int64_t iH, int64_t iW,
int64_t oD, int64_t oH, int64_t oW, int64_t kD, int64_t kH, int64_t kW,
int64_t sD, int64_t sH, int64_t sW, int64_t pD, int64_t pH, int64_t pW,
int64_t dD, int64_t dH, int64_t dW);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_POOL3D_WITH_ARGMAX_H_

View File

@ -0,0 +1,235 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "max_unpool_2d.h"
#include <cmath>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool2D = "MaxUnpool2D";
#define SWITCH_PARALLEL(SHARD, end_num, ctx) \
if (end_num <= kParallelDataNums) { \
for (size_t i = 0; i < size_t(end_num); i++) { \
SHARD(i, i + 1); \
} \
} else { \
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), "MaxUnpool2D #SHARD Compute failed."); \
}
} // namespace
namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool2DCpuKernel::MaxUnpool2D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
// Compute by indices_type
switch (indices_type) {
case DT_INT32:
return MaxUnpool2DCompute<DATA_T, int32_t>(ctx);
case DT_INT64:
return MaxUnpool2DCompute<DATA_T, int64_t>(ctx);
default:
KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t MaxUnpool2DCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool2D check input and output number failed.");
KERNEL_HANDLE_ERROR(MaxUnpool2DCheck(ctx), "MaxUnpool2D check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto indices_type = ctx.Input(1)->GetDataType();
switch (data_type) {
case DT_INT8:
return MaxUnpool2D_COMPUTE_CASE<int8_t>(ctx, indices_type);
case DT_INT16:
return MaxUnpool2D_COMPUTE_CASE<int16_t>(ctx, indices_type);
case DT_INT32:
return MaxUnpool2D_COMPUTE_CASE<int32_t>(ctx, indices_type);
case DT_INT64:
return MaxUnpool2D_COMPUTE_CASE<int64_t>(ctx, indices_type);
case DT_UINT8:
return MaxUnpool2D_COMPUTE_CASE<uint8_t>(ctx, indices_type);
case DT_UINT16:
return MaxUnpool2D_COMPUTE_CASE<uint16_t>(ctx, indices_type);
case DT_UINT32:
return MaxUnpool2D_COMPUTE_CASE<uint32_t>(ctx, indices_type);
case DT_UINT64:
return MaxUnpool2D_COMPUTE_CASE<uint64_t>(ctx, indices_type);
case DT_FLOAT16:
return MaxUnpool2D_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
case DT_FLOAT:
return MaxUnpool2D_COMPUTE_CASE<float>(ctx, indices_type);
case DT_DOUBLE:
return MaxUnpool2D_COMPUTE_CASE<double>(ctx, indices_type);
default:
KERNEL_LOG_ERROR("MaxUnpool2D kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MaxUnpool2DCpuKernel::MaxUnpool2DCheck(CpuKernelContext &ctx) {
DataType input0Type = ctx.Input(0)->GetDataType();
DataType outputType = ctx.Output(0)->GetDataType();
KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
"The data type of output [%d] need be same with "
"input0 [%d].",
outputType, input0Type)
KERNEL_LOG_INFO(
"MaxUnpool2DCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool2DCpuKernel::MaxUnpool2DCompute(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *output = ctx.Output(0);
std::string dataFormat = "NCHW";
if (ctx.GetAttr("data_format") != nullptr) {
dataFormat = ctx.GetAttr("data_format")->GetString();
}
int32_t NIndex, CIndex, HIndex, WIndex;
bool error = false;
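  // The scatter below depends on the memory layout: NHWC keeps channels innermost,
  // while NCHW (default) keeps each channel plane contiguous.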
if (dataFormat == "NHWC") {
NIndex = 0;
CIndex = 3;
HIndex = 1;
WIndex = 2;
auto inputShape = input->GetTensorShape();
int64_t numBatch = inputShape->GetDimSize(NIndex);
int64_t inputHeight = inputShape->GetDimSize(HIndex);
int64_t inputWidth = inputShape->GetDimSize(WIndex);
int64_t numChannels = inputShape->GetDimSize(CIndex);
auto output_shape = output->GetTensorShape();
int64_t oheight = output_shape->GetDimSize(HIndex);
int64_t owidth = output_shape->GetDimSize(WIndex);
auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * oheight * owidth * numChannels; s++) {
rawOutput[s] = (DATA_T)0;
}
auto shard = [&](int64_t start, int64_t end) {
for (int64_t n = start; n < end; n++) {
int64_t nOutputOffset = n * numChannels * owidth * oheight;
int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
DATA_T *output_p_k = rawOutput + nOutputOffset;
DATA_T *input_p_k = rawInput + nInputOffset;
INDICES_T *ind_p_k = rawIndices + nInputOffset;
int64_t maxp;
for (int64_t k = 0; k < numChannels; k++) {
for (int64_t i = 0; i < inputHeight; i++) {
for (int64_t j = 0; j < inputWidth; j++) {
maxp = ind_p_k[i * inputWidth * numChannels + j * numChannels + k];
if (maxp < 0 || maxp >= owidth * oheight) {
error = true;
KERNEL_LOG_ERROR(
"MaxUnpool2D: output_size H_out * W_out "
"should be bigger than argmax, now H_out is [%ld], "
"and W_out is [%ld], but one of the values in argmax is "
"[%ld].",
oheight, owidth, maxp);
} else {
output_p_k[maxp * numChannels + k] = input_p_k[i * inputWidth * numChannels + j * numChannels + k];
}
}
}
}
}
};
SWITCH_PARALLEL(shard, numBatch, ctx);
} else {
NIndex = 0;
CIndex = 1;
HIndex = 2;
WIndex = 3;
auto inputShape = input->GetTensorShape();
int64_t numBatch = inputShape->GetDimSize(NIndex);
int64_t inputHeight = inputShape->GetDimSize(HIndex);
int64_t inputWidth = inputShape->GetDimSize(WIndex);
int64_t numChannels = inputShape->GetDimSize(CIndex);
auto output_shape = output->GetTensorShape();
int64_t oheight = output_shape->GetDimSize(HIndex);
int64_t owidth = output_shape->GetDimSize(WIndex);
auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * oheight * owidth * numChannels; s++) {
rawOutput[s] = (DATA_T)0;
}
auto shard = [&](int64_t start, int64_t end) {
for (int64_t n = start; n < end; n++) {
int64_t nOutputOffset = n * numChannels * owidth * oheight;
int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
int64_t k = 0;
for (k = 0; k < numChannels; k++) {
int64_t finalOutputOffset = nOutputOffset + k * owidth * oheight;
int64_t finalInputOffset = nInputOffset + k * inputWidth * inputHeight;
DATA_T *output_p_k = rawOutput + finalOutputOffset;
DATA_T *input_p_k = rawInput + finalInputOffset;
INDICES_T *ind_p_k = rawIndices + finalInputOffset;
int64_t maxp;
for (int64_t i = 0; i < inputHeight; i++) {
for (int64_t j = 0; j < inputWidth; j++) {
maxp = ind_p_k[i * inputWidth + j];
if (maxp < 0 || maxp >= owidth * oheight) {
error = true;
KERNEL_LOG_ERROR(
"MaxUnpool2D: output_size H_out * W_out "
"should be bigger than argmax, now H_out is [%ld], "
"and W_out is [%ld], but one of the values in argmax is "
"[%ld].",
oheight, owidth, maxp);
} else {
output_p_k[maxp] = input_p_k[i * inputWidth + j];
}
}
}
}
}
};
SWITCH_PARALLEL(shard, numBatch, ctx);
}
  return error ? KERNEL_STATUS_PARAM_INVALID : KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaxUnpool2D, MaxUnpool2DCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool2DCpuKernel : public CpuKernel {
public:
MaxUnpool2DCpuKernel() = default;
~MaxUnpool2DCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t MaxUnpool2DCheck(CpuKernelContext &ctx);
template <typename T>
static uint32_t MaxUnpool2D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);
template <typename T, typename S>
static uint32_t MaxUnpool2DCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_H_

View File

@ -0,0 +1,247 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "max_unpool_2d_grad.h"
#include <cmath>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool2DGrad = "MaxUnpool2DGrad";
#define SWITCH_PARALLEL(SHARD, end_num, ctx) \
if (end_num <= kParallelDataNums) { \
for (size_t i = 0; i < size_t(end_num); i++) { \
SHARD(i, i + 1); \
} \
} else { \
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), \
"MaxUnpool2DGrad #SHARD Compute failed."); \
}
} // namespace
namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool2DGradCpuKernel::MaxUnpool2DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
// Compute by indices_type
switch (indices_type) {
case DT_INT32:
return MaxUnpool2DGradCompute<DATA_T, int32_t>(ctx);
case DT_INT64:
return MaxUnpool2DGradCompute<DATA_T, int64_t>(ctx);
default:
KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t MaxUnpool2DGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool2DGrad check input and output number failed.");
KERNEL_HANDLE_ERROR(MaxUnpool2DGradCheck(ctx), "MaxUnpool2DGrad check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto indices_type = ctx.Input(2)->GetDataType();
switch (data_type) {
case DT_INT8:
return MaxUnpool2DGrad_COMPUTE_CASE<int8_t>(ctx, indices_type);
case DT_INT16:
return MaxUnpool2DGrad_COMPUTE_CASE<int16_t>(ctx, indices_type);
case DT_INT32:
return MaxUnpool2DGrad_COMPUTE_CASE<int32_t>(ctx, indices_type);
case DT_INT64:
return MaxUnpool2DGrad_COMPUTE_CASE<int64_t>(ctx, indices_type);
case DT_UINT8:
return MaxUnpool2DGrad_COMPUTE_CASE<uint8_t>(ctx, indices_type);
case DT_UINT16:
return MaxUnpool2DGrad_COMPUTE_CASE<uint16_t>(ctx, indices_type);
case DT_UINT32:
return MaxUnpool2DGrad_COMPUTE_CASE<uint32_t>(ctx, indices_type);
case DT_UINT64:
return MaxUnpool2DGrad_COMPUTE_CASE<uint64_t>(ctx, indices_type);
case DT_FLOAT16:
return MaxUnpool2DGrad_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
case DT_FLOAT:
return MaxUnpool2DGrad_COMPUTE_CASE<float>(ctx, indices_type);
case DT_DOUBLE:
return MaxUnpool2DGrad_COMPUTE_CASE<double>(ctx, indices_type);
default:
KERNEL_LOG_ERROR("MaxUnpool2DGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCheck(CpuKernelContext &ctx) {
DataType input0Type = ctx.Input(0)->GetDataType();
DataType input1Type = ctx.Input(1)->GetDataType();
DataType outputType = ctx.Output(0)->GetDataType();
KERNEL_CHECK_FALSE((input0Type == input1Type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input1Type [%d] need be same with "
"input0 [%d].",
input1Type, input0Type)
KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
"The data type of output [%d] need be same with "
"input0 [%d].",
outputType, input0Type)
auto Input0_size = ctx.Input(0)->GetTensorShape()->GetDimSizes();
auto Input2_size = ctx.Input(2)->GetTensorShape()->GetDimSizes();
  KERNEL_CHECK_FALSE((Input0_size == Input2_size), KERNEL_STATUS_PARAM_INVALID,
                     "The shape of input x must be the same as the shape of input argmax.")
KERNEL_LOG_INFO(
"MaxUnpool2DGradCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], input2: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Input(2)->GetDataSize(),
ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCompute(CpuKernelContext &ctx) {
Tensor *grads = ctx.Input(1);
Tensor *indices = ctx.Input(2);
Tensor *output = ctx.Output(0);
std::string dataFormat = "NCHW";
if (ctx.GetAttr("data_format") != nullptr) {
dataFormat = ctx.GetAttr("data_format")->GetString();
}
int32_t NIndex, CIndex, HIndex, WIndex;
bool error = false;
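  // Gradient gather: each element of the input-shaped output reads the pooled gradient
  // at the position stored in argmax, honoring the NHWC / NCHW layout.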
if (dataFormat == "NHWC") {
NIndex = 0;
CIndex = 3;
HIndex = 1;
WIndex = 2;
auto grads_out_shape = grads->GetTensorShape();
int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
int64_t oheight = grads_out_shape->GetDimSize(HIndex);
int64_t owidth = grads_out_shape->GetDimSize(WIndex);
int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
auto output_shape = output->GetTensorShape();
int64_t iheight = output_shape->GetDimSize(HIndex);
int64_t iwidth = output_shape->GetDimSize(WIndex);
auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * iheight * iwidth * numChannels; s++) {
rawOutput[s] = (DATA_T)0;
}
auto shard = [&](int64_t start, int64_t end) {
for (int64_t n = start; n < end; n++) {
int64_t nOutputOffset = n * numChannels * iwidth * iheight;
int64_t nGradsOffset = n * numChannels * owidth * oheight;
DATA_T *output_p_k = rawOutput + nOutputOffset;
DATA_T *grads_p_k = rawGrads + nGradsOffset;
INDICES_T *ind_p_k = rawIndices + nOutputOffset;
int64_t maxp;
for (int64_t k = 0; k < numChannels; k++) {
for (int64_t i = 0; i < iheight; i++) {
for (int64_t j = 0; j < iwidth; j++) {
maxp = ind_p_k[i * iwidth * numChannels + j * numChannels + k];
if (maxp < 0 || maxp >= owidth * oheight) {
error = true;
KERNEL_LOG_ERROR(
"MaxUnpool2DGrad: output_size H_out * W_out "
"should be bigger than argmax, now H_out is [%ld], "
"and W_out is [%ld], but one of the values in argmax is "
"[%ld].",
oheight, owidth, maxp);
} else {
output_p_k[i * iwidth * numChannels + j * numChannels + k] = grads_p_k[maxp * numChannels + k];
}
}
}
}
}
};
SWITCH_PARALLEL(shard, numBatch, ctx);
} else {
NIndex = 0;
CIndex = 1;
HIndex = 2;
WIndex = 3;
auto grads_out_shape = grads->GetTensorShape();
int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
int64_t oheight = grads_out_shape->GetDimSize(HIndex);
int64_t owidth = grads_out_shape->GetDimSize(WIndex);
int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
auto output_shape = output->GetTensorShape();
int64_t iheight = output_shape->GetDimSize(HIndex);
int64_t iwidth = output_shape->GetDimSize(WIndex);
auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * iheight * iwidth * numChannels; s++) {
rawOutput[s] = (DATA_T)0;
}
auto shard = [&](int64_t start, int64_t end) {
for (int64_t n = start; n < end; n++) {
int64_t nOutputOffset = n * numChannels * iwidth * iheight;
int64_t nGradsOffset = n * numChannels * owidth * oheight;
int64_t k = 0;
for (k = 0; k < numChannels; k++) {
int64_t finalOutputOffset = nOutputOffset + k * iwidth * iheight;
int64_t finalGradsOffset = nGradsOffset + k * owidth * oheight;
DATA_T *output_p_k = rawOutput + finalOutputOffset;
DATA_T *grads_p_k = rawGrads + finalGradsOffset;
INDICES_T *ind_p_k = rawIndices + finalOutputOffset;
int64_t maxp;
for (int64_t i = 0; i < iheight; i++) {
for (int64_t j = 0; j < iwidth; j++) {
maxp = ind_p_k[i * iwidth + j];
if (maxp < 0 || maxp >= owidth * oheight) {
error = true;
KERNEL_LOG_ERROR(
"MaxUnpool2DGrad: output_size H_out * W_out "
"should be bigger than argmax, now H_out is [%ld], "
"and W_out is [%ld], but one of the values in argmax is "
"[%ld].",
oheight, owidth, maxp);
} else {
output_p_k[i * iwidth + j] = grads_p_k[maxp];
}
}
}
}
}
};
SWITCH_PARALLEL(shard, numBatch, ctx);
}
  return error ? KERNEL_STATUS_PARAM_INVALID : KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaxUnpool2DGrad, MaxUnpool2DGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_GRAD_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool2DGradCpuKernel : public CpuKernel {
public:
MaxUnpool2DGradCpuKernel() = default;
~MaxUnpool2DGradCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t MaxUnpool2DGradCheck(CpuKernelContext &ctx);
template <typename T>
static uint32_t MaxUnpool2DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);
template <typename T, typename S>
static uint32_t MaxUnpool2DGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_GRAD_H_

View File

@ -0,0 +1,247 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "max_unpool_3d.h"
#include <cmath>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool3D = "MaxUnpool3D";
#define SWITCH_PARALLEL(SHARD, end_num, ctx) \
if (end_num <= kParallelDataNums) { \
for (size_t i = 0; i < size_t(end_num); i++) { \
SHARD(i, i + 1); \
} \
} else { \
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), "MaxUnpool3D #SHARD Compute failed."); \
}
} // namespace
namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool3DCpuKernel::MaxUnpool3D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
// Compute by indices_type
switch (indices_type) {
case DT_INT32:
return MaxUnpool3DCompute<DATA_T, int32_t>(ctx);
case DT_INT64:
return MaxUnpool3DCompute<DATA_T, int64_t>(ctx);
default:
KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t MaxUnpool3DCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool3D check input and output number failed.");
KERNEL_HANDLE_ERROR(MaxUnpool3DCheck(ctx), "MaxUnpool3D check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto indices_type = ctx.Input(1)->GetDataType();
switch (data_type) {
case DT_INT8:
return MaxUnpool3D_COMPUTE_CASE<int8_t>(ctx, indices_type);
case DT_INT16:
return MaxUnpool3D_COMPUTE_CASE<int16_t>(ctx, indices_type);
case DT_INT32:
return MaxUnpool3D_COMPUTE_CASE<int32_t>(ctx, indices_type);
case DT_INT64:
return MaxUnpool3D_COMPUTE_CASE<int64_t>(ctx, indices_type);
case DT_UINT8:
return MaxUnpool3D_COMPUTE_CASE<uint8_t>(ctx, indices_type);
case DT_UINT16:
return MaxUnpool3D_COMPUTE_CASE<uint16_t>(ctx, indices_type);
case DT_UINT32:
return MaxUnpool3D_COMPUTE_CASE<uint32_t>(ctx, indices_type);
case DT_UINT64:
return MaxUnpool3D_COMPUTE_CASE<uint64_t>(ctx, indices_type);
case DT_FLOAT16:
return MaxUnpool3D_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
case DT_FLOAT:
return MaxUnpool3D_COMPUTE_CASE<float>(ctx, indices_type);
case DT_DOUBLE:
return MaxUnpool3D_COMPUTE_CASE<double>(ctx, indices_type);
default:
KERNEL_LOG_ERROR("MaxUnpool3D kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MaxUnpool3DCpuKernel::MaxUnpool3DCheck(CpuKernelContext &ctx) {
DataType input0Type = ctx.Input(0)->GetDataType();
DataType outputType = ctx.Output(0)->GetDataType();
KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
"The data type of output [%d] need be same with "
"input0 [%d].",
outputType, input0Type)
KERNEL_LOG_INFO(
"MaxUnpool3DCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool3DCpuKernel::MaxUnpool3DCompute(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *output = ctx.Output(0);
std::string dataFormat = "NCDHW";
if (ctx.GetAttr("data_format") != nullptr) {
dataFormat = ctx.GetAttr("data_format")->GetString();
}
int32_t NIndex, CIndex, DIndex, HIndex, WIndex;
bool error = false;
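  // 3-D unpooling scatter: each argmax value addresses a flattened D * H * W volume
  // within its (batch, channel) slice; layout handling mirrors the 2-D kernel.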
if (dataFormat == "NDHWC") {
NIndex = 0;
CIndex = 4;
DIndex = 1;
HIndex = 2;
WIndex = 3;
auto input_shape = input->GetTensorShape();
int64_t numBatch = input_shape->GetDimSize(NIndex);
int64_t inputDepth = input_shape->GetDimSize(DIndex);
int64_t inputHeight = input_shape->GetDimSize(HIndex);
int64_t inputWidth = input_shape->GetDimSize(WIndex);
int64_t numChannels = input_shape->GetDimSize(CIndex);
auto output_shape = output->GetTensorShape();
int64_t odepth = output_shape->GetDimSize(DIndex);
int64_t oheight = output_shape->GetDimSize(HIndex);
int64_t owidth = output_shape->GetDimSize(WIndex);
auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * odepth * oheight * owidth * numChannels; s++) {
rawOutput[s] = (DATA_T)0;
}
auto shard = [&](int64_t start, int64_t end) {
for (int64_t n = start; n < end; n++) {
int64_t nOutputOffset = n * numChannels * odepth * owidth * oheight;
int64_t nInputOffset = n * numChannels * inputDepth * inputWidth * inputHeight;
DATA_T *output_p_k = rawOutput + nOutputOffset;
DATA_T *input_p_k = rawInput + nInputOffset;
INDICES_T *ind_p_k = rawIndices + nInputOffset;
int64_t maxp;
for (int64_t k = 0; k < numChannels; k++) {
for (int64_t t = 0; t < inputDepth; t++) {
for (int64_t i = 0; i < inputHeight; i++) {
for (int64_t j = 0; j < inputWidth; j++) {
maxp = ind_p_k[t * inputHeight * inputWidth * numChannels + i * inputWidth * numChannels +
j * numChannels + k];
if (maxp < 0 || maxp >= odepth * owidth * oheight) {
error = true;
KERNEL_LOG_ERROR(
"MaxUnpool3D: output_size D_out * H_out * W_out "
"should be bigger than argmax, now D_out is [%ld], H_out "
"is [%ld], and W_out is [%ld], but one of the values in "
"argmax is [%ld].",
odepth, oheight, owidth, maxp);
} else {
output_p_k[maxp * numChannels + k] = input_p_k[t * inputHeight * inputWidth * numChannels +
i * inputWidth * numChannels + j * numChannels + k];
}
}
}
}
}
}
};
SWITCH_PARALLEL(shard, numBatch, ctx);
} else {
NIndex = 0;
CIndex = 1;
DIndex = 2;
HIndex = 3;
WIndex = 4;
auto input_shape = input->GetTensorShape();
int64_t numBatch = input_shape->GetDimSize(NIndex);
int64_t inputDepth = input_shape->GetDimSize(DIndex);
int64_t inputHeight = input_shape->GetDimSize(HIndex);
int64_t inputWidth = input_shape->GetDimSize(WIndex);
int64_t numChannels = input_shape->GetDimSize(CIndex);
auto output_shape = output->GetTensorShape();
int64_t odepth = output_shape->GetDimSize(DIndex);
int64_t oheight = output_shape->GetDimSize(HIndex);
int64_t owidth = output_shape->GetDimSize(WIndex);
auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * odepth * oheight * owidth * numChannels; s++) {
rawOutput[s] = (DATA_T)0;
}
auto shard = [&](int64_t start, int64_t end) {
for (int64_t n = start; n < end; n++) {
int64_t nOutputOffset = n * numChannels * odepth * owidth * oheight;
int64_t nInputOffset = n * numChannels * inputDepth * inputWidth * inputHeight;
int64_t k = 0;
for (k = 0; k < numChannels; k++) {
int64_t finalOutputOffset = nOutputOffset + k * odepth * owidth * oheight;
int64_t finalInputOffset = nInputOffset + k * inputDepth * inputWidth * inputHeight;
DATA_T *output_p_k = rawOutput + finalOutputOffset;
DATA_T *input_p_k = rawInput + finalInputOffset;
INDICES_T *ind_p_k = rawIndices + finalInputOffset;
int64_t maxp;
for (int64_t t = 0; t < inputDepth; t++) {
for (int64_t i = 0; i < inputHeight; i++) {
for (int64_t j = 0; j < inputWidth; j++) {
maxp = ind_p_k[t * inputHeight * inputWidth + i * inputWidth + j];
if (maxp < 0 || maxp >= odepth * owidth * oheight) {
error = true;
KERNEL_LOG_ERROR(
"MaxUnpool3D: output_size D_out * H_out * W_out "
"should be bigger than argmax, now D_out is [%ld], H_out "
"is [%ld], and W_out is [%ld], but one of the values in "
"argmax is [%ld].",
odepth, oheight, owidth, maxp);
} else {
output_p_k[maxp] = input_p_k[t * inputHeight * inputWidth + i * inputWidth + j];
}
}
}
}
}
}
};
SWITCH_PARALLEL(shard, numBatch, ctx);
}
  return error ? KERNEL_STATUS_PARAM_INVALID : KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaxUnpool3D, MaxUnpool3DCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool3DCpuKernel : public CpuKernel {
public:
MaxUnpool3DCpuKernel() = default;
~MaxUnpool3DCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t MaxUnpool3DCheck(CpuKernelContext &ctx);
template <typename T>
static uint32_t MaxUnpool3D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);
template <typename T, typename S>
static uint32_t MaxUnpool3DCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_H_

View File

@ -0,0 +1,258 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "max_unpool_3d_grad.h"
#include <cmath>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool3DGrad = "MaxUnpool3DGrad";
#define SWITCH_PARALLEL(SHARD, end_num, ctx) \
if (end_num <= kParallelDataNums) { \
for (size_t i = 0; i < size_t(end_num); i++) { \
SHARD(i, i + 1); \
} \
} else { \
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), \
"MaxUnpool3DGrad #SHARD Compute failed."); \
}
} // namespace
namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool3DGradCpuKernel::MaxUnpool3DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
// Compute by indices_type
switch (indices_type) {
case DT_INT32:
return MaxUnpool3DGradCompute<DATA_T, int32_t>(ctx);
case DT_INT64:
return MaxUnpool3DGradCompute<DATA_T, int64_t>(ctx);
default:
KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t MaxUnpool3DGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool3DGrad check input and output number failed.");
KERNEL_HANDLE_ERROR(MaxUnpool3DGradCheck(ctx), "MaxUnpool3DGrad check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
auto indices_type = ctx.Input(2)->GetDataType();
switch (data_type) {
case DT_INT8:
return MaxUnpool3DGrad_COMPUTE_CASE<int8_t>(ctx, indices_type);
case DT_INT16:
return MaxUnpool3DGrad_COMPUTE_CASE<int16_t>(ctx, indices_type);
case DT_INT32:
return MaxUnpool3DGrad_COMPUTE_CASE<int32_t>(ctx, indices_type);
case DT_INT64:
return MaxUnpool3DGrad_COMPUTE_CASE<int64_t>(ctx, indices_type);
case DT_UINT8:
return MaxUnpool3DGrad_COMPUTE_CASE<uint8_t>(ctx, indices_type);
case DT_UINT16:
return MaxUnpool3DGrad_COMPUTE_CASE<uint16_t>(ctx, indices_type);
case DT_UINT32:
return MaxUnpool3DGrad_COMPUTE_CASE<uint32_t>(ctx, indices_type);
case DT_UINT64:
return MaxUnpool3DGrad_COMPUTE_CASE<uint64_t>(ctx, indices_type);
case DT_FLOAT16:
return MaxUnpool3DGrad_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
case DT_FLOAT:
return MaxUnpool3DGrad_COMPUTE_CASE<float>(ctx, indices_type);
case DT_DOUBLE:
return MaxUnpool3DGrad_COMPUTE_CASE<double>(ctx, indices_type);
default:
KERNEL_LOG_ERROR("MaxUnpool3DGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCheck(CpuKernelContext &ctx) {
DataType input0Type = ctx.Input(0)->GetDataType();
DataType input1Type = ctx.Input(1)->GetDataType();
DataType outputType = ctx.Output(0)->GetDataType();
KERNEL_CHECK_FALSE((input0Type == input1Type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input1Type [%d] need be same with "
"input0 [%d].",
input1Type, input0Type)
KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
"The data type of output [%d] need be same with "
"input0 [%d].",
outputType, input0Type)
auto Input0_size = ctx.Input(0)->GetTensorShape()->GetDimSizes();
auto Input2_size = ctx.Input(2)->GetTensorShape()->GetDimSizes();
  KERNEL_CHECK_FALSE((Input0_size == Input2_size), KERNEL_STATUS_PARAM_INVALID,
                     "The shape of input x must be the same as the shape of input argmax.")
KERNEL_LOG_INFO(
"MaxUnpool3DGradCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], input2: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Input(2)->GetDataSize(),
ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute(CpuKernelContext &ctx) {
Tensor *grads = ctx.Input(1);
Tensor *indices = ctx.Input(2);
Tensor *output = ctx.Output(0);
std::string dataFormat = "NCDHW";
if (ctx.GetAttr("data_format") != nullptr) {
dataFormat = ctx.GetAttr("data_format")->GetString();
}
int32_t NIndex, CIndex, DIndex, HIndex, WIndex;
bool error = false;
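  // 3-D gradient gather, mirroring MaxUnpool3D but reading from the pooled gradients
  // at the positions stored in argmax.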
if (dataFormat == "NDHWC") {
NIndex = 0;
CIndex = 4;
DIndex = 1;
HIndex = 2;
WIndex = 3;
auto grads_out_shape = grads->GetTensorShape();
int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
int64_t odepth = grads_out_shape->GetDimSize(DIndex);
int64_t oheight = grads_out_shape->GetDimSize(HIndex);
int64_t owidth = grads_out_shape->GetDimSize(WIndex);
int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
auto output_shape = output->GetTensorShape();
int64_t idepth = output_shape->GetDimSize(DIndex);
int64_t iheight = output_shape->GetDimSize(HIndex);
int64_t iwidth = output_shape->GetDimSize(WIndex);
auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * iheight * iwidth * idepth * numChannels; s++) {
rawOutput[s] = (DATA_T)0;
}
auto shard = [&](int64_t start, int64_t end) {
for (int64_t n = start; n < end; n++) {
int64_t nOutputOffset = n * numChannels * iwidth * iheight * idepth;
int64_t nGradsOffset = n * numChannels * owidth * oheight * odepth;
DATA_T *output_p_k = rawOutput + nOutputOffset;
DATA_T *grads_p_k = rawGrads + nGradsOffset;
INDICES_T *ind_p_k = rawIndices + nOutputOffset;
int64_t maxp;
for (int64_t k = 0; k < numChannels; k++) {
for (int64_t t = 0; t < idepth; t++) {
for (int64_t i = 0; i < iheight; i++) {
for (int64_t j = 0; j < iwidth; j++) {
maxp = ind_p_k[t * iwidth * iheight * numChannels + i * iwidth * numChannels + j * numChannels + k];
if (maxp < 0 || maxp >= owidth * oheight * odepth) {
error = true;
KERNEL_LOG_ERROR(
"MaxUnpool3DGrad: output_size D_out * H_out * W_out "
"should be bigger than argmax, now D_out is [%ld], H_out "
"is [%ld], and W_out is [%ld], but one of the values in "
"argmax is [%ld].",
odepth, oheight, owidth, maxp);
} else {
output_p_k[t * iwidth * iheight * numChannels + i * iwidth * numChannels + j * numChannels + k] =
grads_p_k[maxp * numChannels + k];
}
}
}
}
}
}
};
SWITCH_PARALLEL(shard, numBatch, ctx);
} else {
NIndex = 0;
CIndex = 1;
DIndex = 2;
HIndex = 3;
WIndex = 4;
auto grads_out_shape = grads->GetTensorShape();
int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
int64_t odepth = grads_out_shape->GetDimSize(DIndex);
int64_t oheight = grads_out_shape->GetDimSize(HIndex);
int64_t owidth = grads_out_shape->GetDimSize(WIndex);
int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
auto output_shape = output->GetTensorShape();
int64_t idepth = output_shape->GetDimSize(DIndex);
int64_t iheight = output_shape->GetDimSize(HIndex);
int64_t iwidth = output_shape->GetDimSize(WIndex);
auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * idepth * iheight * iwidth * numChannels; s++) {
rawOutput[s] = (DATA_T)0;
}
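    // In NCDHW each (batch, channel) pair owns a contiguous D * H * W plane, so the loop below
    // first offsets to that plane and then addresses element (t, i, j) at t * H * W + i * W + j;
    // e.g. for H = 2, W = 2 the element (t = 0, i = 1, j = 1) sits at offset 3 (illustrative
    // values). Here the argmax value indexes the pooled plane directly, so the routed gradient
    // is simply grads_p_k[maxp].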
auto shard = [&](int64_t start, int64_t end) {
for (int64_t n = start; n < end; n++) {
int64_t nOutputOffset = n * numChannels * iwidth * iheight * idepth;
int64_t nGradsOffset = n * numChannels * owidth * oheight * odepth;
int64_t k = 0;
for (k = 0; k < numChannels; k++) {
int64_t finalOutputOffset = nOutputOffset + k * iwidth * iheight * idepth;
int64_t finalGradsOffset = nGradsOffset + k * owidth * oheight * odepth;
DATA_T *output_p_k = rawOutput + finalOutputOffset;
DATA_T *grads_p_k = rawGrads + finalGradsOffset;
INDICES_T *ind_p_k = rawIndices + finalOutputOffset;
int64_t maxp;
for (int64_t t = 0; t < idepth; t++) {
for (int64_t i = 0; i < iheight; i++) {
for (int64_t j = 0; j < iwidth; j++) {
maxp = ind_p_k[t * iheight * iwidth + i * iwidth + j];
if (maxp < 0 || maxp >= owidth * oheight * odepth) {
error = true;
KERNEL_LOG_ERROR(
"MaxUnpool3DGrad: output_size D_out * H_out * W_out "
"should be bigger than argmax, now D_out is [%ld], H_out "
"is [%ld], and W_out is [%ld], but one of the values in "
"argmax is [%ld].",
odepth, oheight, owidth, maxp);
} else {
output_p_k[t * iheight * iwidth + i * iwidth + j] = grads_p_k[maxp];
}
}
}
}
}
}
};
SWITCH_PARALLEL(shard, numBatch, ctx);
}
  if (error) {
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaxUnpool3DGrad, MaxUnpool3DGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_GRAD_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool3DGradCpuKernel : public CpuKernel {
public:
MaxUnpool3DGradCpuKernel() = default;
~MaxUnpool3DGradCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t MaxUnpool3DGradCheck(CpuKernelContext &ctx);
template <typename T>
static uint32_t MaxUnpool3DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);
template <typename T, typename S>
static uint32_t MaxUnpool3DGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_GRAD_H_

View File

@ -0,0 +1,429 @@
/**
* Copyright 2021 Harbin Institute of Technology
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "maxpool_grad.h"
#include <Eigen/Dense>
#include <string>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/allocator_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kMaxPoolGrad = "MaxPoolGrad";
constexpr uint32_t kInvalidMaxPoolingIndex = -1;
constexpr uint32_t kMaxPoolGradInputNum = 3;
constexpr uint32_t kMaxPoolGradOutputNum = 1;
constexpr int64_t kParallelNum_7K = 7 * 1024;
constexpr int64_t kParallelNum_16K = 16 * 1024;
constexpr int64_t kParallelNum_128K = 128 * 1024;
constexpr uint32_t kThirdInputIndex = 2;
struct PoolParams {
int depth;
int tensor_cols;
int tensor_rows;
int tensor_batch;
int ksize_rows;
int ksize_cols;
int ksize_depth;
int strides_rows;
int strides_cols;
int strides_depth;
int64_t out_height;
int64_t out_width;
int out_depth;
int64_t pad_top;
int64_t pad_bottom;
int64_t pad_left;
int64_t pad_right;
};
} // namespace
namespace aicpu {
template <typename T, typename Targmax>
uint32_t SpatialMaxPoolWithArgMaxHelper(CpuKernelContext &ctx, const PoolParams &params) {
bool include_batch_in_index = true;
Tensor *tensor_in = ctx.Input(kFirstInputIndex);
EigenTensor input_eigen_tensor(tensor_in, tensor_in->GetData());
Tensor *tensor_out = ctx.Input(kSecondInputIndex);
EigenTensor output_eigen_tensor(tensor_out, tensor_out->GetData());
Tensor *tensor_out_backprop = ctx.Input(2);
EigenTensor out_backprop(tensor_out_backprop, tensor_out_backprop->GetData());
Tensor *tensor_output_dup = ctx.Output(kFirstOutputIndex);
EigenTensor input_backprop(tensor_output_dup, tensor_output_dup->GetData());
// create a new aicpu::Tensor
auto tensor_out_arg_max_tmp = CpuKernelUtils::CreateTensor();
Targmax *arg_max = new Targmax[tensor_output_dup->NumElements()];
TensorShape out_dup_ts = *(tensor_output_dup->GetTensorShape());
tensor_out_arg_max_tmp->SetDataType(DT_INT64);
tensor_out_arg_max_tmp->SetData(static_cast<void *>(arg_max));
tensor_out_arg_max_tmp->SetDataSize(tensor_output_dup->GetDataSize());
auto out_arg_max_ts = tensor_out_arg_max_tmp->GetTensorShape();
out_arg_max_ts->SetFormat(out_dup_ts.GetFormat());
out_arg_max_ts->SetUnknownRank(out_dup_ts.GetUnknownRank());
out_arg_max_ts->SetDimSizes(out_dup_ts.GetDimSizes());
auto tensor_out_arg_max = tensor_out_arg_max_tmp.get();
EigenTensor output_arg_max(tensor_out_arg_max, tensor_out_arg_max->GetData());
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenMatrixMap;
typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> EigenMatrixMap;
typedef Eigen::Map<Eigen::Matrix<Targmax, Eigen::Dynamic, Eigen::Dynamic>> EigenIndexMatrixMap;
ConstEigenMatrixMap in_mat(input_eigen_tensor.flat<T>().data(), params.depth,
params.tensor_cols * params.tensor_rows * params.tensor_batch);
EigenMatrixMap out_mat(output_eigen_tensor.flat<T>().data(), params.depth,
params.out_width * params.out_height * params.tensor_batch);
EigenIndexMatrixMap out_arg_max_mat(output_arg_max.flat<Targmax>().data(), params.depth,
params.out_width * params.out_height * params.tensor_batch);
input_backprop.flat<T>().setZero();
auto orig_input_ptr = static_cast<T *>(tensor_in->GetData());
auto orig_output_ptr = static_cast<T *>(tensor_out->GetData());
auto grad_ptr = static_cast<T *>(tensor_out_backprop->GetData());
auto output_ptr = static_cast<T *>(tensor_output_dup->GetData());
// shard_NCHW's limit is params.tensor_batch * params.depth
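  // Rough idea of shard_NCHW (restating the lambda below): each shard handles a range of
  // (batch, channel) planes; for every pooled cell y it scans the corresponding input window,
  // finds the first position whose value equals the pooled output y_arr(y), and adds the
  // incoming gradient dy_arr(y) to that position in dx. Ties therefore all route to the first
  // (top-left-most) maximum inside the window.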
auto shard_NCHW = [&params, &orig_input_ptr, &orig_output_ptr, &grad_ptr, &output_ptr](int64_t start, int64_t limit) {
typedef Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenArrayMap;
typedef Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> EigenArrayMap;
const int64_t X_W = static_cast<int64_t>(params.tensor_cols), X_H = static_cast<int64_t>(params.tensor_rows);
const int64_t Y_W = params.out_width, Y_H = params.out_height;
const int64_t batch_size = limit;
const int64_t X_HxW = X_H * X_W, Y_HxW = Y_H * Y_W;
const int64_t X_stride = X_HxW, Y_stride = Y_HxW;
const int64_t stride_h = static_cast<int64_t>(params.strides_rows),
stride_w = static_cast<int64_t>(params.strides_cols);
const int64_t pad_t = params.pad_top, pad_l = params.pad_left;
const int64_t kernel_h = static_cast<int64_t>(params.ksize_rows),
kernel_w = static_cast<int64_t>(params.ksize_cols);
const T *dy_ptr = grad_ptr + start * Y_stride;
const T *x_ptr = orig_input_ptr + start * X_stride;
const T *y_ptr = orig_output_ptr + start * Y_stride;
T *dx_ptr = output_ptr + start * X_stride;
for (int64_t i = start; i < batch_size; i++) {
ConstEigenArrayMap dy_arr(dy_ptr, Y_W, Y_H);
ConstEigenArrayMap x_arr(x_ptr, X_W, X_H);
ConstEigenArrayMap y_arr(y_ptr, Y_W, Y_H);
EigenArrayMap dx_arr(dx_ptr, X_W, X_H);
for (int64_t h = 0; h < Y_H; ++h) {
const int64_t t = std::max(h * stride_h - pad_t, static_cast<int64_t>(0));
const int64_t b = std::min(h * stride_h - pad_t + kernel_h, X_H);
for (int64_t w = 0; w < Y_W; ++w) {
const int64_t l = std::max(w * stride_w - pad_l, static_cast<int64_t>(0));
const int64_t r = std::min(w * stride_w - pad_l + kernel_w, X_W);
const int64_t y = h * Y_W + w;
auto some_max_block = (x_arr.block(l, t, r - l, b - t) == y_arr(y)).template cast<T>();
int64_t first_max_x_rel = 0, first_max_y_rel = 0;
bool max_found = false;
for (int64_t by = 0; by < b - t; ++by) {
for (int64_t bx = 0; bx < r - l; ++bx) {
if (some_max_block(bx, by) == static_cast<T>(1)) {
first_max_x_rel = bx, first_max_y_rel = by, max_found = true;
break;
}
}
if (max_found) {
break;
}
}
const int64_t fact_index_h = t + first_max_y_rel, fact_index_w = l + first_max_x_rel;
*(dx_ptr + fact_index_h * X_W + fact_index_w) += static_cast<T>(1) * dy_arr(y);
}
}
dy_ptr += Y_stride;
x_ptr += X_stride;
y_ptr += Y_stride;
dx_ptr += X_stride;
}
};
auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop, &output_arg_max, &out_backprop,
&tensor_out_backprop, include_batch_in_index](int64_t start, int64_t limit) {
const int32_t depth = params.depth;
const int32_t in_rows = params.tensor_rows;
const int32_t in_cols = params.tensor_cols;
const int32_t pad_top = params.pad_top;
const int32_t pad_left = params.pad_left;
const int32_t window_rows = params.ksize_rows;
const int32_t window_cols = params.ksize_cols;
const int32_t row_stride = params.strides_rows;
const int32_t col_stride = params.strides_cols;
const int32_t out_height = params.out_height;
const int32_t out_width = params.out_width;
{
const int32_t output_image_size = out_height * out_width * depth;
EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1, (limit - start) * output_image_size);
out_shard.setConstant(Eigen::NumTraits<T>::lowest());
EigenIndexMatrixMap out_arg_max_shard(out_arg_max_mat.data() + start * output_image_size, 1,
(limit - start) * output_image_size);
out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex);
}
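    // For every input position the loops below enumerate the pooled cells whose window covers
    // it. As an illustrative example (values assumed, not taken from a real shape): with
    // window_rows = 3, row_stride = 2, pad_top = 1 and input row h = 4, hpad = 5,
    // h_start = (5 - 3) / 2 + 1 = 2 and h_end = min(5 / 2 + 1, out_height) = 3, so this row
    // only feeds pooled row 2. The running max and its argmax are updated per depth slice.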
for (int64_t b = start; b < limit; ++b) {
for (int h = 0; h < in_rows; ++h) {
for (int w = 0; w < in_cols; ++w) {
const int hpad = h + pad_top;
const int wpad = w + pad_left;
const int h_start = (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1;
const int h_end = std::min(hpad / row_stride + 1, out_height);
const int w_start = (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1;
const int w_end = std::min(wpad / col_stride + 1, out_width);
const int64_t in_index = (b * in_rows + h) * in_cols + w;
for (int ph = h_start; ph < h_end; ++ph) {
const int64_t out_index_base = (b * out_height + ph) * out_width;
for (int pw = w_start; pw < w_end; ++pw) {
const int64_t out_index = out_index_base + pw;
for (int d = 0; d < depth; ++d) {
const T &input_ref = in_mat.coeffRef(d, in_index);
T &output_ref = out_mat.coeffRef(d, out_index);
Targmax &out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
if (output_ref < input_ref || out_arg_max_ref == kInvalidMaxPoolingIndex) {
output_ref = input_ref;
if (include_batch_in_index) {
out_arg_max_ref = in_index * depth + d;
} else {
out_arg_max_ref = (h * in_cols + w) * depth + d;
}
}
}
}
}
}
}
}
if (include_batch_in_index) {
auto input_backprop_flat = input_backprop.flat<T>();
auto out_arg_max_flat = output_arg_max.flat<int64_t>();
auto out_backprop_flat = out_backprop.flat<T>();
const int64_t in_size = in_rows * in_cols * depth;
const int64_t in_start = start * in_size;
const int64_t in_end = limit * in_size;
EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1, in_end - in_start);
in_shard.setConstant(T(0));
// Backpropagate.
const int out_size = out_height * out_width * depth;
const int out_start = start * out_size;
const int out_end = limit * out_size;
for (int index = out_start; index < out_end; ++index) {
int input_backprop_index = out_arg_max_flat(index);
// BoundsCheck
if (input_backprop_index - in_start >= 0 && input_backprop_index - in_end < 0) {
if (index < (tensor_out_backprop->NumElements())) {
input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
}
} else {
KERNEL_LOG_ERROR("[MaxPoolGrad] Backpropagate boundsCheck failed");
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
    return KERNEL_STATUS_OK;
};
const int64_t total_elements = params.tensor_batch * params.tensor_rows * params.tensor_cols * params.depth;
if (ctx.GetAttr("data_format") != nullptr && ctx.GetAttr("data_format")->GetString() == "NCHW") {
const int64_t total_images = params.tensor_batch * params.depth;
if (total_elements <= kParallelNum_16K) {
shard_NCHW(0, total_images);
return KERNEL_STATUS_OK;
} else {
return CpuKernelUtils::ParallelFor(ctx, total_images, 1, shard_NCHW);
}
}
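  // Thread-count heuristic for the NHWC path (restating the branches below): tensors with at
  // most kParallelNum_7K elements are processed in a single call; between 7K and 16K elements
  // at most four cores are used; at kParallelNum_128K and above, or when more cores than
  // batches are available, the core count is capped at the batch size so each shard gets
  // whole batches.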
uint32_t tensor_batch = params.tensor_batch;
if (total_elements <= kParallelNum_7K) {
shard(0, params.tensor_batch);
return KERNEL_STATUS_OK;
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (total_elements <= kParallelNum_16K) {
max_core_num = std::min(max_core_num, 4U);
}
if (total_elements >= kParallelNum_128K || max_core_num > tensor_batch) {
max_core_num = params.tensor_batch;
}
return CpuKernelUtils::ParallelFor(ctx, params.tensor_batch, params.tensor_batch / max_core_num, shard);
}
}
uint32_t CheckMaxPoolGrad(CpuKernelContext &ctx) {
Tensor *tensor_in = ctx.Input(kFirstInputIndex);
Tensor *tensor_out = ctx.Input(kSecondInputIndex);
Tensor *out_backprop = ctx.Input(kThirdInputIndex);
const std::vector<std::string> attr = {"ksize", "strides", "padding"};
KERNEL_CHECK_FALSE(NormalCheck(ctx, kMaxPoolGradInputNum, kMaxPoolGradOutputNum, attr) == KERNEL_STATUS_OK,
KERNEL_STATUS_PARAM_INVALID, "[MaxPoolGrad] NormalCheck input and output failed.");
// check tensor_in dims
Tensor &input0 = *(tensor_in);
auto input_shape_ptr = input0.GetTensorShape();
KERNEL_CHECK_FALSE(input_shape_ptr->GetDims() == 4, KERNEL_STATUS_PARAM_INVALID,
"Non-empty [4D] tensor expected for input(0).");
// check tensor_out dims
Tensor &input1 = *(tensor_out);
auto output_shape_ptr = input1.GetTensorShape();
KERNEL_CHECK_FALSE(output_shape_ptr->GetDims() == 4, KERNEL_STATUS_PARAM_INVALID,
"Non-empty [4D] tensor expected for input(1).");
// check out_backprop dims
Tensor &input2 = *(out_backprop);
auto grad_shape_ptr = input2.GetTensorShape();
KERNEL_CHECK_FALSE(grad_shape_ptr->GetDims() == 4, KERNEL_STATUS_PARAM_INVALID,
"Non-empty [4D] tensor expected for input(2).");
// check output data
KERNEL_LOG_DEBUG("[MaxPoolGrad] Parameters check pass.");
return KERNEL_STATUS_OK;
}
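// Output size and padding follow the usual TF-style rules (the worked numbers here are only an
// illustration): for VALID, out = (in - k + stride) / stride; for SAME,
// out = (in + stride - 1) / stride with pad_need = max(0, (out - 1) * stride + k - in), split as
// pad_before = pad_need / 2 and pad_after = pad_need - pad_before. E.g. in = 10, k = 3,
// stride = 2 gives VALID out = 4, and SAME out = 5 with pad_need = 1, pad_before = 0,
// pad_after = 1.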
uint32_t GetOutputSizeGrad(int input_size, int kernel_size, int stride, const std::string &padding,
int64_t *output_size, int64_t *padding_before, int64_t *padding_after) {
KERNEL_CHECK_FALSE(stride > 0, KERNEL_STATUS_PARAM_INVALID, "[MaxPoolGrad] Stride must be positive.");
std::string same("SAME"), valid("VALID");
if (valid == padding) {
*output_size = (input_size - kernel_size + stride) / stride;
*padding_before = 0;
*padding_after = 0;
} else if (same == padding) {
*output_size = (input_size + stride - 1) / stride;
const int64_t padding_need =
std::max(static_cast<int64_t>(0), (*output_size - 1) * stride + kernel_size - input_size);
*padding_before = padding_need / 2;
*padding_after = padding_need - *padding_before;
} else {
KERNEL_LOG_ERROR("[MaxPoolGrad] Padding is invalid.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (*output_size < 0) {
KERNEL_LOG_ERROR("[MaxPoolGrad] Computed output size is negative.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t ConstructPoolParams(aicpu::CpuKernelContext &ctx, const aicpu::TensorShape &data_format, PoolParams &params) {
Format format = data_format.GetFormat();
KERNEL_CHECK_FALSE((format == FORMAT_NHWC || format == FORMAT_NCHW), KERNEL_STATUS_PARAM_INVALID,
"[MaxPoolGrad] Format is not NHWC or NCHW.");
std::vector<int64_t> tensor_in_shapes = data_format.GetDimSizes();
std::vector<int64_t> ksize = ctx.GetAttr("ksize")->GetListInt(), strides = ctx.GetAttr("strides")->GetListInt();
std::string padding = ctx.GetAttr("padding")->GetString();
std::string data_format_str = "";
if (ctx.GetAttr("data_format") == nullptr) {
KERNEL_LOG_INFO("[MaxPoolGrad] Attr data_format is empty, using default value NHWC.");
format = FORMAT_NHWC;
} else {
std::map<std::string, aicpu::Format> format_str_to_enum_map = {{"NHWC", FORMAT_NHWC}, {"NCHW", FORMAT_NCHW}};
data_format_str = ctx.GetAttr("data_format")->GetString();
KERNEL_HANDLE_ERROR(format_str_to_enum_map.find(data_format_str) == format_str_to_enum_map.end(),
"[MaxPoolGrad] data_format string is invalid.");
format = format_str_to_enum_map[data_format_str];
}
switch (format) {
case FORMAT_NHWC:
params.depth = tensor_in_shapes[kFormatNHWCIndexC];
params.tensor_rows = tensor_in_shapes[kFormatNHWCIndexH];
params.tensor_cols = tensor_in_shapes[kFormatNHWCIndexW];
params.tensor_batch = tensor_in_shapes[kFormatNHWCIndexN];
params.ksize_rows = ksize[kFormatNHWCIndexH];
params.ksize_cols = ksize[kFormatNHWCIndexW];
params.ksize_depth = ksize[kFormatNHWCIndexC];
params.strides_rows = strides[kFormatNHWCIndexH];
params.strides_cols = strides[kFormatNHWCIndexW];
params.strides_depth = strides[kFormatNHWCIndexC];
break;
case FORMAT_NCHW:
params.depth = tensor_in_shapes[kFormatNCHWIndexC];
params.tensor_rows = tensor_in_shapes[kFormatNCHWIndexH];
params.tensor_cols = tensor_in_shapes[kFormatNCHWIndexW];
params.tensor_batch = tensor_in_shapes[kFormatNCHWIndexN];
params.ksize_rows = ksize[kFormatNCHWIndexH];
params.ksize_cols = ksize[kFormatNCHWIndexW];
params.ksize_depth = ksize[kFormatNCHWIndexC];
params.strides_rows = strides[kFormatNCHWIndexH];
params.strides_cols = strides[kFormatNCHWIndexW];
params.strides_depth = strides[kFormatNCHWIndexC];
break;
default:
KERNEL_LOG_ERROR("[MaxPoolGrad] Format is not NHWC or NCHW, current is [%d].", format);
return KERNEL_STATUS_PARAM_INVALID;
}
// 1 types of pooling is supported: 2d pooling on w/h
// depth pooling on channel is not supported
KERNEL_CHECK_FALSE(params.ksize_depth == 1, KERNEL_STATUS_PARAM_INVALID,
"[MaxPoolGrad] Only pooling on width/height is supported.");
// Padding calc
if (params.ksize_depth == 1) {
uint32_t ret1 = GetOutputSizeGrad(params.tensor_rows, params.ksize_rows, params.strides_rows, padding,
&params.out_height, &params.pad_top, &params.pad_bottom);
uint32_t ret2 = GetOutputSizeGrad(params.tensor_cols, params.ksize_cols, params.strides_cols, padding,
&params.out_width, &params.pad_left, &params.pad_right);
KERNEL_CHECK_FALSE(ret1 == KERNEL_STATUS_OK && ret2 == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
"[MaxPoolGrad] An error occurred while calculating output size.");
params.out_depth = params.depth;
}
return KERNEL_STATUS_OK;
}
template <class T>
uint32_t ComputeMaxPoolGradImpl(CpuKernelContext &ctx) {
TensorShape ts = *(ctx.Input(kFirstInputIndex)->GetTensorShape());
PoolParams params;
KERNEL_CHECK_FALSE(ConstructPoolParams(ctx, ts, params) == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
"[MaxPoolGrad] Parameters construct failed.")
return SpatialMaxPoolWithArgMaxHelper<T, int64_t>(ctx, params);
}
uint32_t MaxPoolGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_CHECK_FALSE(CheckMaxPoolGrad(ctx) == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
"[MaxPoolGrad] Parameters check failure.");
DataType input_type = ctx.Input(kFirstInputIndex)->GetDataType();
switch (input_type) {
case DT_FLOAT16:
return ComputeMaxPoolGradImpl<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeMaxPoolGradImpl<float>(ctx);
case DT_DOUBLE:
return ComputeMaxPoolGradImpl<double>(ctx);
case DT_INT8:
return ComputeMaxPoolGradImpl<int8_t>(ctx);
case DT_INT16:
return ComputeMaxPoolGradImpl<int16_t>(ctx);
case DT_INT32:
return ComputeMaxPoolGradImpl<int32_t>(ctx);
case DT_INT64:
return ComputeMaxPoolGradImpl<int64_t>(ctx);
case DT_UINT8:
return ComputeMaxPoolGradImpl<uint8_t>(ctx);
case DT_UINT16:
return ComputeMaxPoolGradImpl<uint16_t>(ctx);
case DT_UINT32:
return ComputeMaxPoolGradImpl<uint32_t>(ctx);
case DT_UINT64:
return ComputeMaxPoolGradImpl<uint64_t>(ctx);
default:
KERNEL_LOG_ERROR("[MaxPoolGrad] Input Data type [%s] is not supported.", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaxPoolGrad, MaxPoolGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,30 @@
/**
* Copyright 2021 Harbin Institute of Technology
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAX_POOL_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAX_POOL_GRAD_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxPoolGradCpuKernel : public CpuKernel {
public:
~MaxPoolGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,253 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "mirror_pad.h"
#include "Eigen/Core"
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/equal_util.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kMirrorPadInputNum = 2;
constexpr uint32_t kMirrorPadOutputNum = 1;
const char *kMirrorPad = "MirrorPad";
constexpr int kMinDims = 0;
constexpr int kMaxDims = 5;
constexpr int kTwo = 2;
std::vector<std::string> attr_names;
std::vector<int64_t> input_dim_shape;
std::vector<int64_t> output_dim_shape;
std::vector<std::pair<int64_t, int64_t>> padding_;
std::vector<uint64_t> input_strides_;
std::vector<uint64_t> output_strides_;
int64_t input_num_elements;
int64_t output_num_elements;
int32_t dims_;
int64_t offset_;
} // namespace
namespace aicpu {
template <typename T>
uint32_t MirrorPadCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
// check params
attr_names.emplace_back("mode");
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMirrorPadInputNum, kMirrorPadOutputNum, attr_names),
"[%s] check params failed.", kMirrorPad);
// get Attr mode
AttrValue *mode_ptr = ctx.GetAttr("mode");
auto mode = mode_ptr->GetString();
  KERNEL_CHECK_FALSE((mode == "SYMMETRIC" || mode == "REFLECT"), KERNEL_STATUS_PARAM_INVALID,
                     "Attr mode must be either REFLECT or SYMMETRIC, but got attr mode [%s].", mode.c_str());
if (mode == "SYMMETRIC") {
offset_ = 0;
} else if (mode == "REFLECT") {
offset_ = 1;
}
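  // The offset distinguishes the two mirror modes (example values for a 1-D case): with input
  // [1, 2, 3] and paddings (2, 2), SYMMETRIC (offset_ = 0) mirrors across the edge including it,
  // giving [2, 1, 1, 2, 3, 3, 2], while REFLECT (offset_ = 1) excludes the edge element, giving
  // [3, 2, 1, 2, 3, 2, 1]; this is also why REFLECT requires paddings strictly smaller than the
  // corresponding dimension size.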
// get input x
Tensor *x_ptr = ctx.Input(0);
data_type_ = x_ptr->GetDataType();
auto x_shape_ptr = x_ptr->GetTensorShape();
auto dims = x_shape_ptr->GetDims();
dims_ = x_shape_ptr->GetDims();
KERNEL_CHECK_FALSE((kMinDims <= dims && dims <= kMaxDims), KERNEL_STATUS_PARAM_INVALID,
"inputs rank not in [%lld, %lld]: %lld", kMinDims, kMaxDims, dims);
// get input paddings
Tensor *paddings_ptr = ctx.Input(1);
auto paddings_shape_ptr = paddings_ptr->GetTensorShape();
KERNEL_CHECK_FALSE((paddings_ptr->GetDataType() == DT_INT32 || paddings_ptr->GetDataType() == DT_INT64),
KERNEL_STATUS_PARAM_INVALID,
"Input split_dim data type must be DT_INT32 or DT_INT64, "
"but got data type[%s]",
DTypeStr(paddings_ptr->GetDataType()).c_str());
KERNEL_CHECK_FALSE(IsMatrix(paddings_shape_ptr->GetDimSizes()) && paddings_shape_ptr->GetDimSize(1),
KERNEL_STATUS_PARAM_INVALID, "paddings must be a matrix with 2 columns: [%lld] ",
paddings_shape_ptr->GetDimSizes());
KERNEL_CHECK_FALSE(dims == paddings_shape_ptr->GetDimSize(0), KERNEL_STATUS_PARAM_INVALID,
"The first dimension of paddings must be the rank of inputs [%lld] , "
"[%lld]",
x_shape_ptr->GetDimSizes(), paddings_shape_ptr->GetDimSizes());
// Compute the shape of the output tensor, and allocate it.
auto size_pads_data = reinterpret_cast<T *>(paddings_ptr->GetData());
input_num_elements = 1;
output_num_elements = 1;
for (int d = 0; d < dims_; ++d) {
int64_t before = *(size_pads_data + d * 2);
int64_t after = *(size_pads_data + d * 2 + 1);
padding_.push_back(std::make_pair(before, after));
KERNEL_CHECK_FALSE(before >= 0 && after >= 0, KERNEL_STATUS_PARAM_INVALID,
"paddings must be non-negative: [%lld] [%lld]", before, after);
if (offset_ == 0) {
KERNEL_CHECK_FALSE(before <= x_shape_ptr->GetDimSize(d) && after <= x_shape_ptr->GetDimSize(d),
KERNEL_STATUS_PARAM_INVALID,
"paddings must be no greater "
"than the dimension size: [%lld] , [%lld] greater than [%lld] ",
before, after, x_shape_ptr->GetDimSize(d));
} else if (offset_ == 1) {
KERNEL_CHECK_FALSE(before < x_shape_ptr->GetDimSize(d) && after < x_shape_ptr->GetDimSize(d),
KERNEL_STATUS_PARAM_INVALID,
"paddings must be no greater "
"than the dimension size: [%lld] , [%lld] not less than [%lld] ",
before, after, x_shape_ptr->GetDimSize(d));
}
input_dim_shape.push_back(x_shape_ptr->GetDimSize(d));
int64_t dimi = after + x_shape_ptr->GetDimSize(d) + before;
input_num_elements *= x_shape_ptr->GetDimSize(d);
output_num_elements *= dimi;
output_dim_shape.push_back(dimi);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MirrorPadCpuKernel::DoCompute(CpuKernelContext &ctx) {
auto input_data_ptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
if (output_num_elements == ctx.Input(0)->NumElements() || dims_ == 0) {
uint64_t copy_size = ctx.Input(0)->GetDataSize();
auto mem_ret = memcpy_s(output_data, copy_size, input_data_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output failed.", copy_size);
} else {
KERNEL_CHECK_FALSE((MirrorPadCompute<T>(input_data_ptr, output_data) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "MirrorPadCompute failed.");
}
return KERNEL_STATUS_OK;
}
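// MirrorPadCompute works dimension by dimension (a summary of the logic below, with example
// numbers assumed): it first builds input/output strides, e.g. an output shape {4, 7} yields
// output_strides_ = {7, 1}; it then copies each innermost input row into its padded slot in the
// output, and finally, walking from the innermost dimension outwards, fills the left/right pads
// of every copied block with mirrored memcpy_s calls, honouring offset_.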
template <typename T>
uint32_t MirrorPadCpuKernel::MirrorPadCompute(T *input_data_ptr, T *output_data_ptr) {
input_strides_.resize(dims_);
output_strides_.resize(dims_);
input_strides_[dims_ - 1] = 1;
output_strides_[dims_ - 1] = 1;
for (int i = dims_ - 1; i > 0; --i) {
input_strides_[i - 1] = input_strides_[i] * input_dim_shape[i];
output_strides_[i - 1] = output_strides_[i] * output_dim_shape[i];
}
std::vector<std::pair<int64_t, int64_t>> index;
index.resize(dims_);
index[dims_ - 1] = std::make_pair(output_strides_[dims_ - 1] * padding_[dims_ - 1].first,
output_strides_[dims_ - 1] * padding_[dims_ - 1].second);
for (int i = dims_ - 1; i > 0; --i) {
index[i - 1].first = index[i].first + output_strides_[i - 1] * padding_[i - 1].first;
index[i - 1].second = index[i].second + output_strides_[i - 1] * padding_[i - 1].second;
}
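  // index[i].first / index[i].second accumulate, in flat output elements, the left / right
  // padding contributed by dimensions i..dims_-1. The copy loop below starts at index[0].first
  // and, whenever the innermost rolled-over dimension is dep, advances by
  // index[dep].first + index[dep].second extra elements in addition to the row just written.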
if (dims_ == 1) {
memcpy_s(output_data_ptr, padding_[0].first * sizeof(T), input_data_ptr + offset_, padding_[0].first * sizeof(T));
memcpy_s(output_data_ptr + padding_[0].first + input_num_elements, padding_[0].second * sizeof(T),
input_data_ptr + input_num_elements - padding_[0].second - offset_, padding_[0].second * sizeof(T));
memcpy_s(output_data_ptr + padding_[0].first, input_num_elements * sizeof(T), input_data_ptr,
input_num_elements * sizeof(T));
std::reverse(output_data_ptr, output_data_ptr + padding_[0].first);
std::reverse(output_data_ptr + padding_[0].first + input_num_elements,
output_data_ptr + padding_[0].first + input_num_elements + padding_[0].second);
return KERNEL_STATUS_OK;
}
std::vector<int64_t> pos;
std::vector<int64_t> output_pos, tmp_pos;
pos.resize(dims_ - 1, 0);
int64_t output_index = index[0].first;
int64_t inx = 0, copy_size = sizeof(T) * input_dim_shape[dims_ - 1];
while (inx < input_num_elements) {
memcpy_s(output_data_ptr + output_index, copy_size, input_data_ptr + inx, copy_size);
output_pos.push_back(output_index);
pos[dims_ - kTwo] += 1;
int64_t dep = dims_ - 1;
for (int64_t i = dims_ - 2; i >= 0; --i) {
if (i > 0 && pos[i] >= input_dim_shape[i]) {
pos[i] -= input_dim_shape[i];
pos[i - 1] += 1;
dep = i;
} else {
break;
}
}
output_index += index[dep].first + index[dep].second + input_dim_shape[dims_ - 1];
inx += input_dim_shape[dims_ - 1];
}
for (int64_t i = dims_ - 1; i >= 0; --i) {
int64_t block_size = output_strides_[i], count = 0;
copy_size = block_size * sizeof(T);
for (auto item : output_pos) {
T *base_output_ptr1 = output_data_ptr + item;
for (int64_t cnt = 1; cnt <= padding_[i].first; ++cnt) {
memcpy_s(base_output_ptr1 - cnt * block_size, copy_size, base_output_ptr1 + (cnt - 1 + offset_) * block_size,
copy_size);
}
T *base_output_ptr2 = output_data_ptr + item + input_dim_shape[i] * block_size;
for (int64_t cnt = 1; cnt <= padding_[i].second; ++cnt) {
memcpy_s(base_output_ptr2 + (cnt - 1) * block_size, copy_size, base_output_ptr2 - (cnt + offset_) * block_size,
copy_size);
}
if (i > 0 && count % input_dim_shape[i - 1] == 0) {
tmp_pos.push_back(item - padding_[i].first * block_size);
}
++count;
}
output_pos.clear();
for (auto item : tmp_pos) {
output_pos.push_back(item);
}
tmp_pos.clear();
}
return KERNEL_STATUS_OK;
}
uint32_t MirrorPadCpuKernel::Compute(CpuKernelContext &ctx) {
auto padding_type_ = ctx.Input(1)->GetDataType();
if (padding_type_ == DT_INT32) {
KERNEL_CHECK_FALSE((CheckAndInitParams<int32_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"CheckAndInitParams failed.");
} else {
KERNEL_CHECK_FALSE((CheckAndInitParams<int64_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"CheckAndInitParams failed.");
}
switch (data_type_) {
case DT_FLOAT16:
return DoCompute<Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_BOOL:
return DoCompute<bool>(ctx);
case DT_INT8:
return DoCompute<int8_t>(ctx);
case DT_INT16:
return DoCompute<int16_t>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
case DT_UINT8:
return DoCompute<uint8_t>(ctx);
case DT_UINT16:
return DoCompute<uint16_t>(ctx);
case DT_COMPLEX64:
return DoCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return DoCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported datatype[%s]", DTypeStr(data_type_).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kMirrorPad, MirrorPadCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,64 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MIRROR_PAD_H_
#define AICPU_KERNELS_NORMALIZED_MIRROR_PAD_H_
#include <algorithm>
#include <memory>
#include <utility>
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace aicpu {
class MirrorPadCpuKernel : public CpuKernel {
public:
MirrorPadCpuKernel() = default;
~MirrorPadCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
/**
* @brief Init params
* @param ctx cpu kernel context
* @return status if success
*/
template <typename T>
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
/**
* @brief padding
* @param input_data_ptr ptr which store input data
* @param output_data_ptr ptr which store output data
* @return status if success
*/
template <typename T>
uint32_t MirrorPadCompute(T *input_data_ptr, T *output_data_ptr);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
private:
DataType data_type_;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,320 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "multi_margin_loss.h"
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const char *kMultiMarginLoss = "MultiMarginLoss";
// When the input data size exceeds kParallelDataNum, use the parallel implementation.
const int64_t kParallelDataNum = 28 * 1024;
} // namespace
namespace aicpu {
uint32_t MultiMarginLossCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
uint32_t kInputNum = 3;
constexpr int SERV_TYPE_SET = 2;
if (ctx.GetInputsSize() == SERV_TYPE_SET) {
kInputNum = SERV_TYPE_SET;
}
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MultiMarginLoss check input and output number failed.");
KERNEL_HANDLE_ERROR(MultiMarginLossCheck(ctx), "MultiMarginLoss check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_FLOAT16:
return MultiMarginLossComputeFP16<Eigen::half>(ctx);
case DT_FLOAT:
return MultiMarginLossCompute<float>(ctx);
case DT_DOUBLE:
return MultiMarginLossCompute<double>(ctx);
default:
KERNEL_LOG_ERROR("MultiMarginLoss kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MultiMarginLossCpuKernel::MultiMarginLossCheck(CpuKernelContext &ctx) {
auto input_0 = ctx.Input(0);
auto input_1 = ctx.Input(1);
constexpr int SERV_TYPE_SET = 2;
constexpr int SERV_TYPE_QUERY = 3;
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input1_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"The data type of target [%s] should be int64.", DTypeStr(input1_type).c_str())
auto target = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
int64_t target_num = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
int64_t batch_size = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
if (ctx.GetInputsSize() == SERV_TYPE_QUERY) {
auto input_weight = ctx.Input(2);
DataType input2_type = input_weight->GetDataType();
KERNEL_CHECK_FALSE((input2_type == input0_type), KERNEL_STATUS_PARAM_INVALID,
"weight should have the same dtype with x, but get [%s].", DTypeStr(input2_type).c_str())
}
KERNEL_CHECK_FALSE((ctx.Input(0)->GetTensorShape()->GetDims() == SERV_TYPE_SET), KERNEL_STATUS_PARAM_INVALID,
"Rank of x should be 2.")
KERNEL_CHECK_FALSE((ctx.Input(1)->GetTensorShape()->GetDims() == 1), KERNEL_STATUS_PARAM_INVALID,
"Rank of target should be 1.")
KERNEL_CHECK_FALSE((batch_size == ctx.Input(1)->GetTensorShape()->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
"[%s] 's x's shape[0] should be the same as target's "
"shape[0].",
ctx.GetOpType().c_str())
for (int64_t i = 0; i < batch_size; i++) {
KERNEL_CHECK_FALSE(*(target + i) >= 0 && (*(target + i) < target_num), KERNEL_STATUS_PARAM_INVALID,
"[%s]'s target out of range", ctx.GetOpType().c_str());
}
return KERNEL_STATUS_OK;
}
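// Per sample i the compute kernels below evaluate (a restatement of the loops, with an
// illustrative example): loss_i = (1 / dims) * sum_{d != target_i} w[target_i] *
// max(0, margin - x[i][target_i] + x[i][d])^p, where the weight factor is applied only when a
// weight input is given, followed by the chosen reduction (none / mean / sum). E.g.
// x = [0.1, 0.2, 0.4], target = 2, margin = 1, p = 1 and no weight gives
// loss = (0.7 + 0.8) / 3 = 0.5. The FP16 path accumulates in float and casts back at the end.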
template <typename T>
uint32_t MultiMarginLossCpuKernel::MultiMarginLossCompute(CpuKernelContext &ctx) {
constexpr int SERV_TYPE_BRWD = 1;
constexpr int SERV_TYPE_SET = 2;
constexpr int ADULT_AGE = 4;
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_target = ctx.Input(1)->GetTensorShape()->GetDimSizes();
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input_target = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
T *input_weight = nullptr;
bool weight_defined_ = (ctx.GetInputsSize() == 3);
if (weight_defined_) {
input_weight = reinterpret_cast<T *>(ctx.Input(2)->GetData());
int64_t weight_length = ctx.Input(2)->NumElements();
int64_t x_length = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
if (weight_length < x_length) {
for (int64_t i = 0; i < x_length - weight_length; i++) {
input_weight[i + weight_length] = static_cast<T>(0);
}
}
}
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
AttrValue *Attr_p = ctx.GetAttr("p");
int p = (Attr_p == nullptr) ? 1 : Attr_p->GetInt();
if (p != SERV_TYPE_BRWD && p != SERV_TYPE_SET) {
KERNEL_LOG_ERROR("MultiMarginLoss kernel attr p should be 1 or 2.");
return KERNEL_STATUS_PARAM_INVALID;
}
AttrValue *Attr_margin = ctx.GetAttr("margin");
T margin = static_cast<T>((Attr_margin == nullptr) ? 1 : Attr_margin->GetFloat());
AttrValue *Attr_red = ctx.GetAttr("reduction");
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
int64_t batch_size = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
int64_t dims = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
Eigen::Array<T, Eigen::Dynamic, 1> output(batch_size, 1);
output.setZero();
auto output_data = output.data();
int64_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, (int64_t)aicpu::CpuKernelUtils::GetCPUNum(ctx));
auto shard_multi_margin_loss = [&](size_t start, size_t end) {
int64_t once_compute_thread_size = end - start;
Eigen::Array<T, Eigen::Dynamic, 1> cacl(dims, 1);
auto cacl_data = cacl.data();
cacl.setZero();
if (dims == 0) {
KERNEL_LOG_ERROR("dims could not be 0.");
}
for (int64_t m = 0; m < (once_compute_thread_size) / dims; m++) {
int64_t i = start / dims;
for (int64_t d = 0; d < dims; d++) {
if (d == input_target[i]) {
continue;
}
cacl_data[d] = margin + input_x[start + d] - input_x[start + input_target[i]];
if (cacl_data[d] > T(0)) {
cacl_data[d] = (p == 1) ? cacl_data[d] : cacl_data[d] * cacl_data[d];
if (weight_defined_) {
cacl_data[d] *= (input_weight[input_target[i]]);
}
output_data[i] += cacl_data[d];
}
}
output_data[i] = output_data[i] / static_cast<T>(dims);
start += dims;
}
};
if ((ctx.Input(0)->NumElements()) * sizeof(T) <= kParallelDataNum) {
Eigen::Array<T, Eigen::Dynamic, 1> cacl(dims, 1);
auto cacl_data = cacl.data();
cacl.setZero();
T sum = static_cast<T>(0);
for (int64_t i = 0; i < batch_size; i++) {
int64_t target_idx = input_target[i];
sum = static_cast<T>(0);
cacl.setZero();
for (int64_t d = 0; d < dims; d++) {
if (d == target_idx) {
continue;
}
cacl_data[d] = margin + input_x[i * dims + d] - input_x[i * dims + target_idx];
if (cacl_data[d] > T(0)) {
cacl_data[d] = (p == 1) ? cacl_data[d] : cacl_data[d] * cacl_data[d];
if (weight_defined_) {
cacl_data[d] *= static_cast<T>(input_weight[target_idx]);
}
sum += cacl_data[d];
}
}
sum = sum / static_cast<T>(dims);
output_data[i] = sum;
}
} else {
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
CpuKernelUtils::ParallelFor(ctx, ctx.Input(0)->NumElements(), dims * ADULT_AGE * (batch_size / max_core_num + 1),
shard_multi_margin_loss);
}
if (reduction == "mean") {
*output_y = output.mean();
}
if (reduction == "sum") {
*output_y = output.sum();
}
if (reduction == "none") {
for (int64_t t = 0; t < batch_size; t++) {
*(output_y + t) = output_data[t];
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MultiMarginLossCpuKernel::MultiMarginLossComputeFP16(CpuKernelContext &ctx) {
constexpr int SERV_TYPE_BRWD = 1;
constexpr int SERV_TYPE_SET = 2;
constexpr int ADULT_AGE = 4;
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_target = ctx.Input(1)->GetTensorShape()->GetDimSizes();
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input_target = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
T *input_weight = nullptr;
bool weight_defined_ = (ctx.GetInputsSize() == 3);
if (weight_defined_) {
input_weight = reinterpret_cast<T *>(ctx.Input(SERV_TYPE_SET)->GetData());
int64_t weight_length = ctx.Input(2)->NumElements();
int64_t x_length = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
if (weight_length < x_length) {
for (int64_t i = 0; i < x_length - weight_length; i++) {
input_weight[i + weight_length] = static_cast<T>(0);
}
}
}
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
AttrValue *Attr_p = ctx.GetAttr("p");
int p = (Attr_p == nullptr) ? 1 : Attr_p->GetInt();
if (p != SERV_TYPE_BRWD && p != SERV_TYPE_SET) {
KERNEL_LOG_ERROR("MultiMarginLoss kernel attr p should be 1 or 2.");
return KERNEL_STATUS_PARAM_INVALID;
}
AttrValue *Attr_margin = ctx.GetAttr("margin");
float margin = static_cast<float>((Attr_margin == nullptr) ? 1 : Attr_margin->GetFloat());
AttrValue *Attr_red = ctx.GetAttr("reduction");
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
int64_t batch_size = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
int64_t dims = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
Eigen::Array<float, Eigen::Dynamic, 1> output(batch_size, 1);
output.setZero();
auto output_data = output.data();
int64_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, (int64_t)aicpu::CpuKernelUtils::GetCPUNum(ctx));
auto shard_multi_margin_loss = [&](size_t start, size_t end) {
int64_t once_compute_thread_size = end - start;
Eigen::Array<float, Eigen::Dynamic, 1> cacl(dims, 1);
auto cacl_data = cacl.data();
cacl.setZero();
if (dims == 0) {
KERNEL_LOG_ERROR("dims could not be 0.");
}
for (int64_t m = 0; m < (once_compute_thread_size) / dims; m++) {
int64_t i = start / dims;
for (int64_t d = 0; d < dims; d++) {
if (d == input_target[i]) {
continue;
}
cacl_data[d] =
margin + static_cast<float>(input_x[start + d]) - static_cast<float>(input_x[start + input_target[i]]);
if (cacl_data[d] > 0) {
cacl_data[d] = (p == 1) ? cacl_data[d] : cacl_data[d] * cacl_data[d];
if (weight_defined_) {
cacl_data[d] *= static_cast<float>(input_weight[input_target[i]]);
}
output_data[i] += cacl_data[d];
}
}
output_data[i] = output_data[i] / static_cast<float>(dims);
start += dims;
}
};
if ((ctx.Input(0)->NumElements()) * sizeof(T) <= kParallelDataNum) {
Eigen::Array<float, Eigen::Dynamic, 1> cacl(dims, 1);
auto cacl_data = cacl.data();
cacl.setZero();
float sum = 0;
for (int64_t i = 0; i < batch_size; i++) {
int64_t target_idx = input_target[i];
sum = 0;
cacl.setZero();
for (int64_t d = 0; d < dims; d++) {
if (d == target_idx) {
continue;
}
cacl_data[d] =
margin + static_cast<float>(input_x[i * dims + d]) - static_cast<float>(input_x[i * dims + target_idx]);
if (cacl_data[d] > 0) {
cacl_data[d] = (p == 1) ? cacl_data[d] : cacl_data[d] * cacl_data[d];
if (weight_defined_) {
cacl_data[d] *= static_cast<float>(input_weight[target_idx]);
}
sum += cacl_data[d];
}
}
sum = sum / static_cast<float>(dims);
output_data[i] = sum;
}
} else {
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
CpuKernelUtils::ParallelFor(ctx, ctx.Input(0)->NumElements(), dims * ADULT_AGE * (batch_size / max_core_num + 1),
shard_multi_margin_loss);
}
if (reduction == "mean") {
*output_y = static_cast<T>(output.mean());
}
if (reduction == "sum") {
*output_y = static_cast<T>(output.sum());
}
if (reduction == "none") {
for (int64_t t = 0; t < batch_size; t++) {
*(output_y + t) = static_cast<T>(output_data[t]);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMultiMarginLoss, MultiMarginLossCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MULTI_MARGIN_LOSS_H_
#define AICPU_KERNELS_NORMALIZED_MULTI_MARGIN_LOSS_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class MultiMarginLossCpuKernel : public CpuKernel {
public:
MultiMarginLossCpuKernel() = default;
~MultiMarginLossCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t MultiMarginLossCheck(CpuKernelContext &ctx);
template <typename T>
static uint32_t MultiMarginLossCompute(CpuKernelContext &ctx);
template <typename T>
static uint32_t MultiMarginLossComputeFP16(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif