forked from mindspore-Ecosystem/mindspore
!48453 merge canndev code to mindspore
Merge pull request !48453 from 沈竞兴/canndev_merge
commit 9493cc3534
@@ -135,4 +135,5 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/comma"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "runtime/indentation_namespace"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/line_length"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/semicolon"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "readability/nolint"
@@ -350,3 +350,8 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/resize_bicubic_grad.cc:aicpu::ResizeBicubicGrad
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_max.cc:aicpu::SegmentMaxCpuKernel::SegmentMaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/extract_glimpse.cc:aicpu::ExtractGlimpseCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_pool_3d_grad_with_argmax.cc:aicpu::MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_power.cc:aicpu::MatrixPowerCpuKernel::ComputeKernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d_grad.cc:aicpu::MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_pool_3d_with_argmax.cc:aicpu::MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/layer_norm_grad_grad.cc:aicpu::LayerNormGradGradCpuKernel::LayerNormGradGradCompute
@@ -0,0 +1,226 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "layer_norm_grad_grad.h"

#include <cmath>
#include <numeric>
#include <vector>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
using namespace std;

namespace {
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 8;
const char *kLayerNormGradGrad = "LayerNormGradGrad";

#define LAYERNORMGRADGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX, NUM)       \
  case (DTYPE): {                                                   \
    uint32_t result = LayerNormGradGradCompute<TYPE>(CTX, NUM);     \
    if (result != KERNEL_STATUS_OK) {                               \
      KERNEL_LOG_ERROR("LayerNormGradGrad kernel compute failed."); \
      return result;                                                \
    }                                                               \
    break;                                                          \
  }

#define SWITCH_PARALLEL(SHARD, data_num, thread_num)                              \
  if (data_num <= ParallelDataNums) {                                             \
    for (size_t i = 0; i < thread_num; i++) {                                     \
      SHARD(i, i + 1);                                                            \
    }                                                                             \
  } else {                                                                        \
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, thread_num, 1, SHARD),   \
                        "LayerNormGradGrad ParallelFor Compute failed.");         \
  }
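// SWITCH_PARALLEL runs SHARD serially, one index per call, when data_num is at
// most ParallelDataNums; otherwise it hands SHARD to CpuKernelUtils::ParallelFor
// with a block size of 1, so each parallel slice also covers a single index.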

Eigen::half sqrt(Eigen::half &data) { return Eigen::half_impl::sqrt(data); }
}  // namespace

namespace aicpu {
uint32_t LayerNormGradGradCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
                      "LayerNormGradGrad check input and output number failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    LAYERNORMGRADGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx, 512)
    LAYERNORMGRADGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx, 4 * 1024)
    default:
      KERNEL_LOG_ERROR("LayerNormGradGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LayerNormGradGradCpuKernel::LayerNormGradGradCompute(CpuKernelContext &ctx, size_t ParallelDataNums) {
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input_dy = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto input_var = reinterpret_cast<T *>(ctx.Input(2)->GetData());
  auto input_mean = reinterpret_cast<T *>(ctx.Input(3)->GetData());
  auto input_gamma = reinterpret_cast<T *>(ctx.Input(4)->GetData());
  auto input_d_dx = reinterpret_cast<T *>(ctx.Input(5)->GetData());
  auto input_d_dg = reinterpret_cast<T *>(ctx.Input(6)->GetData());
  auto input_d_db = reinterpret_cast<T *>(ctx.Input(7)->GetData());

  auto output_sopd_x = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto output_sopd_dy = reinterpret_cast<T *>(ctx.Output(1)->GetData());
  auto output_sopd_g = reinterpret_cast<T *>(ctx.Output(2)->GetData());

  size_t num = static_cast<size_t>(ctx.Input(0)->NumElements());
  size_t g_num = static_cast<size_t>(ctx.Input(4)->NumElements());
  size_t mean_num = static_cast<size_t>(ctx.Input(3)->NumElements());

  KERNEL_CHECK_FALSE((g_num > 0), KERNEL_STATUS_PARAM_INVALID, "gamma should not be empty");

  T *inv_std = new T[mean_num];
  for (size_t i = 0; i < mean_num; i++) {
    if (input_var[i] <= T(0)) {
      KERNEL_LOG_ERROR("variance must be greater than zero");
      // avoid leaking inv_std on the error path
      delete[] inv_std;
      return KERNEL_STATUS_PARAM_INVALID;
    }
    inv_std[i] = T(1) / sqrt(input_var[i]);
  }

  T *x_hat = new T[num];
  T *dy_gamma = new T[num];
  T *sum1 = new T[mean_num];
  std::fill_n(sum1, mean_num, T(0));
  T *sum2 = new T[mean_num];
  std::fill_n(sum2, mean_num, T(0));
  T *sum3 = new T[mean_num];
  std::fill_n(sum3, mean_num, T(0));
  T *sum4 = new T[mean_num];
  std::fill_n(sum4, mean_num, T(0));

  auto shard_inner_mean = [&](size_t start, size_t end) {
    for (size_t sum_idx = start; sum_idx < end; sum_idx++) {
      for (size_t g_idx = 0; g_idx < g_num; g_idx++) {
        size_t i = g_idx + sum_idx * g_num;  // value of sum_idx = i / g_num;
        sum1[sum_idx] -= inv_std[sum_idx] * input_d_dx[i] / static_cast<T>(g_num);
        T cur_x_hat = (input_x[i] - input_mean[sum_idx]) * inv_std[sum_idx];
        x_hat[i] = cur_x_hat;
        sum2[sum_idx] -= cur_x_hat * inv_std[sum_idx] * input_d_dx[i] / static_cast<T>(g_num);
        T cur_dy_gamma = input_dy[i] * input_gamma[g_idx];
        dy_gamma[i] = cur_dy_gamma;
        sum3[sum_idx] += cur_dy_gamma / static_cast<T>(g_num);
        sum4[sum_idx] += cur_dy_gamma * cur_x_hat / static_cast<T>(g_num);
      }
    }
  };
  SWITCH_PARALLEL(shard_inner_mean, num, mean_num);
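  // At this point, for each normalized row r (r = sum_idx): x_hat holds the
  // normalized activations (x - mean_r) * inv_std_r, dy_gamma holds dy * gamma,
  // and sum1..sum4 hold the per-row means
  //   sum1_r = -mean(d_dx * inv_std_r), sum2_r = -mean(x_hat * d_dx * inv_std_r),
  //   sum3_r = mean(dy * gamma),        sum4_r = mean(dy * gamma * x_hat),
  // the building blocks of the second-order LayerNorm backward pass.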
  T *sum5 = new T[mean_num];
  std::fill_n(sum5, mean_num, T(0));
  T *sum6 = new T[mean_num];
  std::fill_n(sum6, mean_num, T(0));
  T *sum7 = new T[mean_num];
  std::fill_n(sum7, mean_num, T(0));
  T *part3 = new T[num];

  auto shard_outer_mean = [&](size_t start, size_t end) {
    for (size_t sum_idx = start; sum_idx < end; sum_idx++) {
      for (size_t g_idx = 0; g_idx < g_num; g_idx++) {
        size_t i = g_idx + sum_idx * g_num;  // value of sum_idx is i / g_num;
        T part_sum1 = dy_gamma[i] - sum3[sum_idx] - x_hat[i] * sum4[sum_idx];
        T part_sum2 = dy_gamma[i] * sum2[sum_idx] - sum4[sum_idx] * input_d_dx[i] * inv_std[sum_idx] +
                      input_dy[i] * input_d_dg[g_idx];
        sum5[sum_idx] += input_d_dx[i] * part_sum1 / static_cast<T>(g_num);
        sum6[sum_idx] += (input_x[i] - input_mean[sum_idx]) * part_sum2 / static_cast<T>(g_num);
        T cur_part3 = inv_std[sum_idx] * part_sum2;
        part3[i] = cur_part3;
        sum7[sum_idx] -= cur_part3 / static_cast<T>(g_num);
      }
    }
  };
  SWITCH_PARALLEL(shard_outer_mean, num, mean_num);
  if (sum3 != nullptr) {
    delete[] sum3;
  }
  if (sum4 != nullptr) {
    delete[] sum4;
  }
  if (dy_gamma != nullptr) {
    delete[] dy_gamma;
  }

  auto shard_input_prop = [&](size_t start, size_t end) {
    for (size_t sum_idx = start; sum_idx < end; sum_idx++) {
      for (size_t g_idx = 0; g_idx < g_num; g_idx++) {
        size_t i = g_idx + sum_idx * g_num;  // value of sum_idx is i / g_num;
        T cur_part4 = -x_hat[i] * inv_std[sum_idx] * inv_std[sum_idx] * (sum5[sum_idx] + sum6[sum_idx]);
        output_sopd_x[i] = part3[i] + cur_part4 + sum7[sum_idx];
        T cur_part5 = input_gamma[g_idx] * input_d_dx[i] * inv_std[sum_idx];
        T cur_part6 = input_gamma[g_idx] * sum1[sum_idx];
        T cur_part7 = input_gamma[g_idx] * x_hat[i] * sum2[sum_idx];
        T cur_part8 = x_hat[i] * input_d_dg[g_idx];
        output_sopd_dy[i] = cur_part5 + cur_part6 + cur_part7 + cur_part8 + input_d_db[g_idx];
      }
    }
  };
  SWITCH_PARALLEL(shard_input_prop, num, mean_num);
  if (sum5 != nullptr) {
    delete[] sum5;
  }
  if (sum6 != nullptr) {
    delete[] sum6;
  }
  if (sum7 != nullptr) {
    delete[] sum7;
  }
  // part3 is no longer needed once shard_input_prop has run; free it here
  if (part3 != nullptr) {
    delete[] part3;
  }
  std::fill_n(output_sopd_g, g_num, T(0));

  auto shard_param_prop = [&](size_t start, size_t end) {
    for (size_t g_idx = start; g_idx < end; g_idx++) {
      for (size_t sum_idx = 0; sum_idx < mean_num; sum_idx++) {
        size_t i = g_idx + sum_idx * g_num;  // value of sum_idx is i / g_num;
        T cur_part9 = input_dy[i] * x_hat[i] * sum2[sum_idx];
        T cur_part10 = input_dy[i] * sum1[sum_idx];
        T cur_part11 = input_dy[i] * input_d_dx[i] * inv_std[sum_idx];
        output_sopd_g[g_idx] += cur_part9 + cur_part10 + cur_part11;
      }
    }
  };
  SWITCH_PARALLEL(shard_param_prop, num, g_num);
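  // Note the change of parallel axis: the first three shards split work across
  // the mean_num rows, while shard_param_prop splits across the g_num gamma
  // columns, so each invocation owns its output_sopd_g[g_idx] accumulator and
  // the += updates stay race-free under ParallelFor.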

  if (sum1 != nullptr) {
    delete[] sum1;
  }
  if (sum2 != nullptr) {
    delete[] sum2;
  }
  if (inv_std != nullptr) {
    delete[] inv_std;
  }
  if (x_hat != nullptr) {
    delete[] x_hat;
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kLayerNormGradGrad, LayerNormGradGradCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_LAYERNORMGRADGRAD_H_
#define AICPU_KERNELS_NORMALIZED_LAYERNORMGRADGRAD_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
#include "utils/eigen_tensor.h"

namespace aicpu {

class LayerNormGradGradCpuKernel : public CpuKernel {
 public:
  LayerNormGradGradCpuKernel() = default;
  ~LayerNormGradGradCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t LayerNormGradGradCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t LayerNormGradGradCompute(CpuKernelContext &ctx, size_t ParallelDataNums);
};
}  // namespace aicpu

#endif
@@ -0,0 +1,256 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "log.h"

#include <cmath>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kLog = "Log";

#define LOG_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                     \
    uint32_t result = LogCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                 \
      KERNEL_LOG_ERROR("Log kernel compute failed."); \
      return result;                                  \
    }                                                 \
    break;                                            \
  }

#define LOG_COMPUTE_CASE2(DTYPE, TYPE, CTX)           \
  case (DTYPE): {                                     \
    uint32_t result = LogCompute2(CTX);               \
    if (result != KERNEL_STATUS_OK) {                 \
      KERNEL_LOG_ERROR("Log kernel compute failed."); \
      return result;                                  \
    }                                                 \
    break;                                            \
  }

#define LOG_COMPUTE_CASE3(DTYPE, TYPE, CTX)           \
  case (DTYPE): {                                     \
    uint32_t result = LogCompute3<TYPE>(CTX);         \
    if (result != KERNEL_STATUS_OK) {                 \
      KERNEL_LOG_ERROR("Log kernel compute failed."); \
      return result;                                  \
    }                                                 \
    break;                                            \
  }
}  // namespace

namespace aicpu {
uint32_t LogCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLog);
  KERNEL_HANDLE_ERROR(LogCheck(ctx), "[%s] check params failed.", kLog);
  DataType data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    LOG_COMPUTE_CASE2(DT_FLOAT16, Eigen::half, ctx)
    LOG_COMPUTE_CASE(DT_FLOAT, float, ctx)
    LOG_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    LOG_COMPUTE_CASE3(DT_COMPLEX64, std::complex<float>, ctx)
    LOG_COMPUTE_CASE3(DT_COMPLEX128, std::complex<double>, ctx)
    default:
      KERNEL_LOG_ERROR("Log kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
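// Dispatch summary: float and double go through the scalar path LogCompute<T>,
// DT_FLOAT16 goes through LogCompute2, which evaluates the logarithm with Eigen
// half-precision arrays, and the complex types go through LogCompute3, whose
// domain check rejects exact zeros rather than non-positive values.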

uint32_t LogCpuKernel::LogCheck(CpuKernelContext &ctx) {
  auto input_0 = ctx.Input(0);
  auto output_0 = ctx.Output(0);
  KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
  KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed.")
  KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
  std::vector<int64_t> shape_x = input_0->GetTensorShape()->GetDimSizes();
  size_t shape_size = shape_x.size();
  KERNEL_CHECK_FALSE((shape_size > 0), KERNEL_STATUS_PARAM_INVALID, "Input must be at least rank 1, got [%zu].",
                     shape_x.size())
  KERNEL_CHECK_FALSE((shape_x[shape_size - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
                     "Input last dimension must be at least 1.")
  AttrValue *base_ptr = ctx.GetAttr("base");
  KERNEL_CHECK_NULLPTR(base_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr base failed.");
  float base_ = base_ptr->GetFloat();
  KERNEL_CHECK_FALSE(((base_ > 0 && base_ != 1.0) || base_ == -1.0), KERNEL_STATUS_PARAM_INVALID,
                     "Attr base must be -1.0, or positive and not equal to 1, but got attr base [%f]", base_);
  AttrValue *scale_ptr = ctx.GetAttr("scale");
  KERNEL_CHECK_NULLPTR(scale_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr scale failed.");
  AttrValue *shift_ptr = ctx.GetAttr("shift");
  KERNEL_CHECK_NULLPTR(shift_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr shift failed.");
  return KERNEL_STATUS_OK;
}
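// With the attributes validated above, every compute path below evaluates
//   y = log(x * scale + shift) / log(base),
// where base == -1.0 is treated as the natural base e (so the divisor becomes
// log(e) == 1). For example, base = 10, scale = 1, shift = 0 reduces to the
// ordinary log10(x).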

template <typename T>
uint32_t LogCpuKernel::LogCompute(CpuKernelContext &ctx) {
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());

  AttrValue *base_ptr = ctx.GetAttr("base");
  T base_ = static_cast<T>(base_ptr->GetFloat());
  if (base_ == static_cast<T>(-1.0)) {
    base_ = static_cast<T>(exp(1.0));
  }
  AttrValue *scale_ptr = ctx.GetAttr("scale");
  T scale_ = static_cast<T>(scale_ptr->GetFloat());
  AttrValue *shift_ptr = ctx.GetAttr("shift");
  T shift_ = static_cast<T>(shift_ptr->GetFloat());

  size_t data_num = ctx.Input(0)->NumElements();
  if (data_num <= 4 * 1024) {
    for (size_t i = 0; i < data_num; i++) {
      if (*(input_x + i) <= static_cast<T>(0)) {
        KERNEL_LOG_ERROR("input[%llu] must be greater than 0.", i);
        return KERNEL_STATUS_PARAM_INVALID;
      }
      *(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
    }
  } else {
    uint32_t min_core_num = 1;
    size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    auto shard_log = [&](size_t start, size_t end) {
      for (size_t i = start; i < end; i++) {
        if (*(input_x + i) <= static_cast<T>(0)) {
          KERNEL_LOG_ERROR("input[%llu] must be greater than 0.", i);
          return KERNEL_STATUS_PARAM_INVALID;
        }
        *(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
      }
      return KERNEL_STATUS_OK;
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log),
                        "Log Compute failed.");
  }
  return KERNEL_STATUS_OK;
}

uint32_t LogCpuKernel::LogCompute2(CpuKernelContext &ctx) {
  auto input_x = reinterpret_cast<Eigen::half *>(ctx.Input(0)->GetData());
  auto output_y = reinterpret_cast<Eigen::half *>(ctx.Output(0)->GetData());
  size_t data_num = ctx.Input(0)->NumElements();
  for (uint64_t i = 0; i < data_num; i++) {
    if (*(input_x + i) <= static_cast<Eigen::half>(0)) {
      KERNEL_LOG_ERROR("input[%llu] must be greater than 0.", i);
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  AttrValue *base_ptr = ctx.GetAttr("base");
  Eigen::half base_ = static_cast<Eigen::half>(base_ptr->GetFloat());
  if (base_ == static_cast<Eigen::half>(-1.0)) {
    base_ = static_cast<Eigen::half>(exp(1.0));
  }
  AttrValue *scale_ptr = ctx.GetAttr("scale");
  Eigen::half scale_ = static_cast<Eigen::half>(scale_ptr->GetFloat());
  AttrValue *shift_ptr = ctx.GetAttr("shift");
  Eigen::half shift_ = static_cast<Eigen::half>(shift_ptr->GetFloat());

  typedef Eigen::Array<Eigen::half, Eigen::Dynamic, Eigen::Dynamic> ArrayxXd;
  ArrayxXd array_x(1, data_num);
  ArrayxXd array_y(1, data_num);
  ArrayxXd array_z(1, 1);
  for (size_t i = 0; i < data_num; i++) {
    array_x(0, i) = *(input_x + i);
  }
  array_x = array_x * scale_;
  array_x = array_x + shift_;
  array_y = array_x.log();
  array_z(0, 0) = base_;
  array_z = array_z.log();
  if (data_num <= 8 * 1024) {
    for (size_t i = 0; i < data_num; i++) {
      *(output_y + i) = array_y(0, i) / array_z(0, 0);
    }
  } else {
    uint32_t min_core_num = 1;
    size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    auto shard_log = [&](size_t start, size_t end) {
      for (size_t i = start; i < end; i++) {
        *(output_y + i) = array_y(0, i) / array_z(0, 0);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log),
                        "Log Compute failed.");
  }
  return KERNEL_STATUS_OK;
}
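// The half-precision path validates all inputs up front, then evaluates the
// elementwise logarithm on a 1 x data_num Eigen::half array and divides by the
// scalar log(base_) held in array_z(0, 0); only that final division is
// parallelized for large tensors, since array_y is already fully materialized.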

template <typename T>
uint32_t LogCpuKernel::LogCompute3(CpuKernelContext &ctx) {
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  size_t data_num = ctx.Input(0)->NumElements();
  AttrValue *base_ptr = ctx.GetAttr("base");
  T base_ = static_cast<T>(base_ptr->GetFloat());
  if (base_ == static_cast<T>(-1.0)) {
    base_ = static_cast<T>(exp(1.0));
  }
  AttrValue *scale_ptr = ctx.GetAttr("scale");
  T scale_ = static_cast<T>(scale_ptr->GetFloat());
  AttrValue *shift_ptr = ctx.GetAttr("shift");
  T shift_ = static_cast<T>(shift_ptr->GetFloat());

  if (data_num <= 4 * 1024) {
    for (size_t i = 0; i < data_num; i++) {
      if (*(input_x + i) == static_cast<T>(0)) {
        KERNEL_LOG_ERROR("input[%llu] must not be 0.", i);
        return KERNEL_STATUS_PARAM_INVALID;
      }
      *(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
    }
  } else {
    uint32_t min_core_num = 1;
    size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }
    auto shard_log = [&](size_t start, size_t end) {
      for (size_t i = start; i < end; i++) {
        if (*(input_x + i) == static_cast<T>(0)) {
          KERNEL_LOG_ERROR("input[%llu] must not be 0.", i);
          return KERNEL_STATUS_PARAM_INVALID;
        }
        *(output_y + i) = std::log(*(input_x + i) * scale_ + shift_) / std::log(base_);
      }
      return KERNEL_STATUS_OK;
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log),
                        "Log Compute failed.");
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kLog, LogCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LOG_H_
#define AICPU_KERNELS_NORMALIZED_LOG_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class LogCpuKernel : public CpuKernel {
 public:
  LogCpuKernel() = default;
  ~LogCpuKernel() override = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t LogCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t LogCompute(CpuKernelContext &ctx);

  uint32_t LogCompute2(CpuKernelContext &ctx);

  template <typename T>
  uint32_t LogCompute3(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,160 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "logspace.h"

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
constexpr uint32_t kLogSpaceInputNum = 2;
constexpr uint32_t kLogSpaceOutputNum = 1;
const char *kLogSpace = "LogSpace";

#define LOGSPACE_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                          \
    uint32_t result = LogSpaceCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                      \
      KERNEL_LOG_ERROR("LogSpace kernel compute failed."); \
      return result;                                       \
    }                                                      \
    break;                                                 \
  }
}  // namespace

namespace aicpu {
uint32_t LogSpaceCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kLogSpaceInputNum, kLogSpaceOutputNum), "[%s] check input and output failed.",
                      kLogSpace);
  KERNEL_HANDLE_ERROR(LogSpaceCheck(ctx), "[%s] check params failed.", kLogSpace);
  DataType data_type = ctx.Output(0)->GetDataType();
  switch (data_type) {
    LOGSPACE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    LOGSPACE_COMPUTE_CASE(DT_FLOAT, float, ctx)
    default:
      KERNEL_LOG_ERROR("LogSpace kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t LogSpaceCpuKernel::LogSpaceCheck(CpuKernelContext &ctx) {
  // get Attr steps_attr
  AttrValue *steps_attr_ptr = ctx.GetAttr("steps");
  if (steps_attr_ptr) {
    int64_t steps_data = steps_attr_ptr->GetInt();
    KERNEL_CHECK_FALSE((steps_data >= 0), KERNEL_STATUS_PARAM_INVALID,
                       "Attr [steps] must be greater than or equal to 0.");
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LogSpaceCpuKernel::LogSpaceCompute(CpuKernelContext &ctx) {
  DataType data_type_in = ctx.Input(0)->GetDataType();
  DataType data_type = ctx.Output(0)->GetDataType();
  if (data_type_in == data_type) {
    auto *input_start_ = reinterpret_cast<T *>(ctx.Input(0)->GetData());
    auto *input_end_ = reinterpret_cast<T *>(ctx.Input(1)->GetData());
    auto input_start = static_cast<double>(input_start_[0]);
    auto input_end = static_cast<double>(input_end_[0]);
    auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
    AttrValue *steps_data = ctx.GetAttr("steps");
    AttrValue *base_data = ctx.GetAttr("base");
    int64_t steps_value = 100;
    int base_value = 10;
    if (steps_data) {
      steps_value = steps_data->GetInt();
    }
    if (base_data) {
      base_value = base_data->GetInt();
    }
    if (steps_value != 1) {
      double b = (input_end - input_start) / (steps_value - 1);
      double q = pow(base_value, b);
      double input_start_value = input_start;
      for (int64_t i = 0; i < steps_value; i++) {
        double end_num = pow(base_value, input_start_value) * pow(q, i);
        *(output_y + i) = static_cast<T>(end_num);
      }
    }
    if (steps_value == 1) {
      double end_num = pow(base_value, double(input_start));
      *(output_y) = static_cast<T>(end_num);
    }
  } else if (data_type_in == DT_FLOAT) {
    auto *input_start_ = reinterpret_cast<float *>(ctx.Input(0)->GetData());
    auto *input_end_ = reinterpret_cast<float *>(ctx.Input(1)->GetData());
    auto input_start = static_cast<double>(input_start_[0]);
    auto input_end = static_cast<double>(input_end_[0]);
    auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
    AttrValue *steps_data = ctx.GetAttr("steps");
    AttrValue *base_data = ctx.GetAttr("base");
    int64_t steps_value = 100;
    int base_value = 10;
    if (steps_data) {
      steps_value = steps_data->GetInt();
    }
    if (base_data) {
      base_value = base_data->GetInt();
    }
    if (steps_value != 1) {
      double b = (input_end - input_start) / (steps_value - 1);
      double q = pow(base_value, b);
      double input_start_value = input_start;
      for (int64_t i = 0; i < steps_value; i++) {
        double end_num = pow(base_value, input_start_value) * pow(q, i);
        *(output_y + i) = static_cast<T>(end_num);
      }
    }
    if (steps_value == 1) {
      double end_num = pow(base_value, double(input_start));
      *(output_y) = static_cast<T>(end_num);
    }
  } else if (data_type_in == DT_FLOAT16) {
    auto *input_start_ = reinterpret_cast<Eigen::half *>(ctx.Input(0)->GetData());
    auto *input_end_ = reinterpret_cast<Eigen::half *>(ctx.Input(1)->GetData());
    auto input_start = static_cast<double>(input_start_[0]);
    auto input_end = static_cast<double>(input_end_[0]);
    auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
    AttrValue *steps_data = ctx.GetAttr("steps");
    AttrValue *base_data = ctx.GetAttr("base");
    int64_t steps_value = 100;
    int base_value = 10;
    if (steps_data) {
      steps_value = steps_data->GetInt();
    }
    if (base_data) {
      base_value = base_data->GetInt();
    }
    if (steps_value != 1) {
      double b = (input_end - input_start) / (steps_value - 1);
      double q = pow(base_value, b);
      for (int64_t i = 0; i < steps_value; i++) {
        double end_num = pow(base_value, input_start) * pow(q, i);
        *(output_y + i) = static_cast<T>(end_num);
      }
    }
    if (steps_value == 1) {
      double end_num = pow(base_value, double(input_start));
      *(output_y) = static_cast<T>(end_num);
    }
  }
  return KERNEL_STATUS_OK;
}
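// All three branches implement the same recurrence: with step
// b = (end - start) / (steps - 1) and ratio q = base^b, the i-th output is
// base^start * q^i == base^(start + i * b). For example, start = 0, end = 2,
// steps = 3, base = 10 produces {1, 10, 100}; steps == 1 is special-cased to
// the single value base^start.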

REGISTER_CPU_KERNEL(kLogSpace, LogSpaceCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,37 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LOGSPACE_H_
#define AICPU_KERNELS_NORMALIZED_LOGSPACE_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class LogSpaceCpuKernel : public CpuKernel {
 public:
  LogSpaceCpuKernel() = default;
  ~LogSpaceCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t LogSpaceCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t LogSpaceCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,126 @@
/**
 * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "matrix_inverse.h"
#include <complex>
#include <vector>
#include "Eigen/Core"
#include "Eigen/LU"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kMatrixInverse = "MatrixInverse";
// If the data size is larger than this value, call ParallelFor().
constexpr int64_t kParallelDataNums = 1 * 1024;

#define MATRIXINVERSE_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                               \
    uint32_t result = MatrixInverseCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                           \
      KERNEL_LOG_ERROR("MatrixInverse kernel compute failed."); \
      return result;                                            \
    }                                                           \
    break;                                                      \
  }
}  // namespace

namespace aicpu {
uint32_t MatrixInverseCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixInverse check input and output number failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    MATRIXINVERSE_COMPUTE_CASE(DT_FLOAT, float, ctx)
    MATRIXINVERSE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    MATRIXINVERSE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
    MATRIXINVERSE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
    default:
      KERNEL_LOG_ERROR("MatrixInverse kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MatrixInverseCpuKernel::MatrixInverseCompute(CpuKernelContext &ctx) {
  Tensor *input = ctx.Input(0);
  T *input_ptr = reinterpret_cast<T *>(input->GetData());
  Tensor *output = ctx.Output(0);
  T *output_ptr = reinterpret_cast<T *>(output->GetData());
  // Check that the input is a (batch of) square matrices
  auto shape = input->GetTensorShape();
  uint64_t data_size = input->GetDataSize();
  std::vector<int64_t> dims = shape->GetDimSizes();
  KERNEL_CHECK_FALSE((dims.size() >= 2 && (*(dims.end() - 1) == *(dims.end() - 2))), KERNEL_STATUS_PARAM_INVALID,
                     "Input shape is wrong");
  auto last_dimsize = *(dims.end() - 1);
  // Output length
  auto input_num = input->NumElements();
  size_t matrix_size = last_dimsize * last_dimsize;
  // Number of matrices
  size_t matrix_num = input_num / matrix_size;
  // Two-dimensional buffer holding each matrix slice
  std::vector<std::vector<T>> temp(matrix_num, std::vector<T>(matrix_size));
  for (size_t i = 0; i < matrix_num; i++) {
    for (size_t j = 0; j < matrix_size; j++) {
      temp[i][j] = *(input_ptr + i * matrix_size + j);
    }
  }
  // Get the value of the attribute adjoint
  AttrValue *adjoint_attr = ctx.GetAttr("adjoint");
  bool adjoint = adjoint_attr->GetBool();
  if (data_size <= kParallelDataNums) {
    for (size_t i = 0; i < matrix_num; i++) {
      Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_input(temp[i].data(), last_dimsize,
                                                                               last_dimsize);
      Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_output(output_ptr + i * matrix_size,
                                                                                last_dimsize, last_dimsize);
      if (adjoint) {
        eigen_input = eigen_input.adjoint().eval();
      }
      Eigen::FullPivLU<Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>> lu(eigen_input);
      eigen_output = lu.inverse();
    }
  } else {
    uint32_t min_core_num = 1;
    size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > matrix_num) {
      max_core_num = matrix_num;
    }
    auto sharedcompute = [&](size_t start, size_t end) {
      for (auto i = start; i < end; i++) {
        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_input(temp[i].data(), last_dimsize,
                                                                                 last_dimsize);
        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> eigen_output(output_ptr + i * matrix_size,
                                                                                  last_dimsize, last_dimsize);
        if (adjoint) {
          eigen_input = eigen_input.adjoint().eval();
        }
        Eigen::FullPivLU<Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>> lu(eigen_input);
        eigen_output = lu.inverse();
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharedcompute),
                        "Compute failed.");
  }
  return KERNEL_STATUS_OK;
}
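// Each slice is inverted with Eigen's full-pivot LU decomposition via
// lu.inverse(); when adjoint is set, the conjugate transpose is taken first,
// which relies on the identity inv(A^H) == (inv(A))^H. Copying each slice into
// temp keeps the mapped input writable without touching the input tensor.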

REGISTER_CPU_KERNEL(kMatrixInverse, MatrixInverseCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,37 @@
/**
 * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MATRIXINVERSE_H_
#define AICPU_KERNELS_NORMALIZED_MATRIXINVERSE_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MatrixInverseCpuKernel : public CpuKernel {
 public:
  MatrixInverseCpuKernel() = default;
  ~MatrixInverseCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  static uint32_t MatrixInverseCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,198 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "matrix_power.h"

#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "cpu_kernel_utils.h"
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include <map>

namespace {
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 1;
const char *kMatrixPower = "MatrixPower";
const int64_t kParallelDataNum = 4 * 1024;
}  // namespace

namespace aicpu {
uint32_t MatrixPowerCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixPower normal check failed.");
  auto x_type = ctx.Input(0)->GetDataType();
  if (x_type == DT_FLOAT) {
    return ComputeKernel<float>(ctx);
  } else {
    return ComputeKernel<Eigen::half>(ctx);
  }
}

template <typename T>
uint32_t MatrixPowerCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
  Tensor *input_x = ctx.Input(0);
  Tensor *output_y = ctx.Output(0);
  AttrValue *power = ctx.GetAttr("n");
  int64_t powervalue = power->GetInt();
  auto x_shape = input_x->GetTensorShape();
  size_t batch = x_shape->GetDimSize(0);
  size_t dim = x_shape->GetDimSize(1);
  auto x_ptr = reinterpret_cast<T *>(input_x->GetData());
  auto y_ptr = reinterpret_cast<T *>(output_y->GetData());
  int64_t data_num = ctx.Input(0)->NumElements() * sizeof(T);

  if (powervalue < 0) {
    powervalue = -powervalue;
    if (data_num >= kParallelDataNum) {
      uint32_t min_core_num = 1;
      uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
      if (max_core_num > batch) {
        max_core_num = batch;
      }
      if (max_core_num == 0) {
        max_core_num = 1;
      }
      int64_t NotInvertible = -1;
      auto shard_matrix_power = [&](size_t start, size_t end) {
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
        for (size_t i = start; i < end; i++) {
          for (size_t p = 0; p < dim; p++) {
            for (size_t q = 0; q < dim; q++) {
              B(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
            }
          }
          Eigen::FullPivLU<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> LU(B);
          if (!(LU.isInvertible())) {
            NotInvertible = i;
          }
          A = LU.inverse();
          B.setIdentity();
          int64_t n = powervalue;
          while (n > 0) {
            if (n % 2 == 1) {
              B = B * A;
            }
            n = n / 2;
            A = A * A;
          }
          for (size_t p = 0; p < dim; p++) {
            for (size_t q = 0; q < dim; q++) {
              y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
            }
          }
        }
      };
      CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, shard_matrix_power);
      KERNEL_CHECK_FALSE((NotInvertible < 0), KERNEL_STATUS_PARAM_INVALID,
                         "The [%lld]-th matrix of the input tensor is singular, but n is negative.", NotInvertible)
    } else {
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
      for (size_t i = 0; i < batch; i++) {
        for (size_t p = 0; p < dim; p++) {
          for (size_t q = 0; q < dim; q++) {
            B(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
          }
        }
        Eigen::FullPivLU<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> LU(B);
        KERNEL_CHECK_FALSE((LU.isInvertible()), KERNEL_STATUS_PARAM_INVALID,
                           "The [%zu]-th matrix of the input tensor is singular, but n is negative.", i)
        A = LU.inverse();
        B.setIdentity();
        int64_t n = powervalue;
        while (n > 0) {
          if (n % 2 == 1) {
            B = B * A;
          }
          n = n / 2;
          A = A * A;
        }
        for (size_t p = 0; p < dim; p++) {
          for (size_t q = 0; q < dim; q++) {
            y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
          }
        }
      }
    }
  } else {
    if (data_num >= kParallelDataNum) {
      uint32_t min_core_num = 1;
      uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
      if (max_core_num > batch) {
        max_core_num = batch;
      }
      if (max_core_num == 0) {
        max_core_num = 1;
      }
      auto shard_matrix_power = [&](size_t start, size_t end) {
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
        for (size_t i = start; i < end; i++) {
          for (size_t p = 0; p < dim; p++) {
            for (size_t q = 0; q < dim; q++) {
              A(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
            }
          }
          B.setIdentity();
          int64_t n = powervalue;
          while (n > 0) {
            if (n % 2 == 1) {
              B = B * A;
            }
            n = n / 2;
            A = A * A;
          }
          for (size_t p = 0; p < dim; p++) {
            for (size_t q = 0; q < dim; q++) {
              y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
            }
          }
        }
      };
      CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, shard_matrix_power);
    } else {
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> A(dim, dim);
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> B(dim, dim);
      for (size_t i = 0; i < batch; i++) {
        for (size_t p = 0; p < dim; p++) {
          for (size_t q = 0; q < dim; q++) {
            A(p, q) = (float)x_ptr[i * dim * dim + p * dim + q];
          }
        }
        B.setIdentity();
        int64_t n = powervalue;
        while (n > 0) {
          if (n % 2 == 1) {
            B = B * A;
          }
          n = n / 2;
          A = A * A;
        }
        for (size_t p = 0; p < dim; p++) {
          for (size_t q = 0; q < dim; q++) {
            y_ptr[i * dim * dim + p * dim + q] = (T)B(p, q);
          }
        }
      }
    }
  }
  return KERNEL_STATUS_OK;
}
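// Both branches raise each matrix to |n| by exponentiation by squaring: the loop
//   while (n > 0) { if (n % 2 == 1) B = B * A; n = n / 2; A = A * A; }
// multiplies the accumulated result B by the current power-of-two factor A
// whenever the corresponding bit of n is set, e.g. n = 5 (binary 101) yields
// B = A * A^4 after three iterations. For negative n, A is first replaced by
// its inverse via full-pivot LU, and singular inputs are reported as errors.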

REGISTER_CPU_KERNEL(kMatrixPower, MatrixPowerCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,36 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_POWER_H_
#define AICPU_KERNELS_NORMALIZED_MATRIX_POWER_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class MatrixPowerCpuKernel : public CpuKernel {
 public:
  MatrixPowerCpuKernel() = default;
  ~MatrixPowerCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  static uint32_t ComputeKernel(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,160 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "matrix_solve.h"

#include <complex>
#include "Eigen/Core"
#include "Eigen/LU"
#include "unsupported/Eigen/CXX11/Tensor"

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const char *kMatrixSolve = "MatrixSolve";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const int64_t kParallelDataNumSameShape = 8 * 1024;
const int64_t kParallelDataNumSameShapeMid = 128 * 1024;
}  // namespace

namespace aicpu {
uint32_t MatrixSolveCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixSolve check input and output number failed.");
  KERNEL_HANDLE_ERROR(MatrixSolveDataAndTypeCheck(ctx), "MatrixSolve check input and output params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    case DT_FLOAT:
      return MatrixSolveCompute<float>(ctx);
    case DT_DOUBLE:
      return MatrixSolveCompute<double>(ctx);
    case DT_COMPLEX64:
      return MatrixSolveCompute<std::complex<float>>(ctx);
    case DT_COMPLEX128:
      return MatrixSolveCompute<std::complex<double>>(ctx);
    default:
      KERNEL_LOG_ERROR("MatrixSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t MatrixSolveCpuKernel::MatrixSolveDataAndTypeCheck(CpuKernelContext &ctx) {
  DataType matrix_type = ctx.Input(0)->GetDataType();
  DataType rhs_type = ctx.Input(1)->GetDataType();
  KERNEL_CHECK_FALSE((matrix_type == rhs_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] must be the same as input1 [%s].",
                     DTypeStr(matrix_type).c_str(), DTypeStr(rhs_type).c_str())

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MatrixSolveCpuKernel::MatrixSolveCompute(CpuKernelContext &ctx) {
  auto input0_tensor = ctx.Input(0);
  auto input0_tensor_shape = input0_tensor->GetTensorShape();
  auto input1_tensor = ctx.Input(1);
  auto input1_tensor_shape = input1_tensor->GetTensorShape();
  auto input0_data = reinterpret_cast<T *>(input0_tensor->GetData());
  auto input1_data = reinterpret_cast<T *>(input1_tensor->GetData());
  auto input0_shape = input0_tensor_shape->GetDimSizes();
  int32_t input0_dims = input0_tensor_shape->GetDims();
  int32_t input1_dims = input1_tensor_shape->GetDims();
  int64_t m = input0_shape[input0_dims - 1];
  int64_t size_mm = m * m;

  KERNEL_CHECK_FALSE((input0_shape[input0_dims - 1] == input0_shape[input0_dims - 2]), KERNEL_STATUS_PARAM_INVALID,
                     "Input[matrix] must be a square matrix")
  KERNEL_CHECK_FALSE((input1_dims >= 2), KERNEL_STATUS_PARAM_INVALID, "Input[rhs] must be a matrix")
  KERNEL_CHECK_FALSE(
    (input0_tensor_shape->GetDimSize(input0_dims - 1) == input1_tensor_shape->GetDimSize(input1_dims - 2)),
    KERNEL_STATUS_PARAM_INVALID, "Input matrix and rhs are incompatible")

  typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
  auto adjoint = ctx.GetAttr("adjoint")->GetBool();
  auto input1_shape = input1_tensor_shape->GetDimSizes();
  int64_t k = input1_shape[input1_dims - 1];
  auto output_tensor = ctx.Output(0);
  auto output_data = reinterpret_cast<T *>(output_tensor->GetData());

  if (size_mm > 0) {
    size_t matrix_num = ctx.Input(0)->NumElements() / size_mm;
    int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
    if (data_size >= kParallelDataNumSameShape) {
      uint32_t min_core_num = 1;
      uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
      if (data_size <= kParallelDataNumSameShapeMid) {
        max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
      }
      // If there are more AI CPU cores than matrices, cap max_core_num at the
      // number of matrices.
      if (max_core_num > matrix_num) {
        max_core_num = matrix_num;
      }
      auto sharder_matrix_solve = [&](size_t start, size_t end) {
        for (size_t i = start; i < end; i++) {
          Eigen::Map<MatrixXd> input0(input0_data + i * m * m, m, m);
          Eigen::Map<MatrixXd> input1(input1_data + i * m * k, m, k);
          Eigen::Map<MatrixXd> output(output_data + i * m * k, m, k);
          if (input0.rows() == 0 || input0.cols() == 0 || input1.cols() == 0) {
            return KERNEL_STATUS_PARAM_INVALID;
          }
          Eigen::PartialPivLU<MatrixXd> lu_decomposition(input0.rows());
          if (adjoint) {
            lu_decomposition.compute(input0.adjoint());
          } else {
            lu_decomposition.compute(input0);
          }
          using RealScalar = typename Eigen::NumTraits<T>::Real;
          RealScalar pivot = lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff();
          KERNEL_CHECK_FALSE((pivot > RealScalar(0)), KERNEL_STATUS_PARAM_INVALID, "Input matrix is not invertible");
          output.noalias() = lu_decomposition.solve(input1);
        }
        return KERNEL_STATUS_OK;
      };
      KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharder_matrix_solve),
                          "Matrix Solve Compute failed");
    } else {
      for (size_t i = 0; i < matrix_num; i++) {
        Eigen::Map<MatrixXd> input0(input0_data + i * m * m, m, m);
        Eigen::Map<MatrixXd> input1(input1_data + i * m * k, m, k);
        Eigen::Map<MatrixXd> output(output_data + i * m * k, m, k);
        if (input0.rows() == 0 || input0.cols() == 0 || input1.cols() == 0) {
          return KERNEL_STATUS_PARAM_INVALID;
        }
        Eigen::PartialPivLU<MatrixXd> lu_decomposition(input0.rows());
        if (adjoint) {
          lu_decomposition.compute(input0.adjoint());
        } else {
          lu_decomposition.compute(input0);
        }
        using RealScalar = typename Eigen::NumTraits<T>::Real;
        RealScalar pivot = lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff();
        KERNEL_CHECK_FALSE((pivot > RealScalar(0)), KERNEL_STATUS_PARAM_INVALID, "Input matrix is not invertible");

        output.noalias() = lu_decomposition.solve(input1);
      }
    }
  }

  return KERNEL_STATUS_OK;
}
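// Each system A x = rhs (or A^H x = rhs when adjoint is true) is solved through
// a partial-pivot LU factorization; instead of forming an explicit inverse, the
// solver back-substitutes, and the smallest absolute diagonal entry of the U
// factor is checked against zero as the singularity test before accepting the
// solution.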

REGISTER_CPU_KERNEL(kMatrixSolve, MatrixSolveCpuKernel);
}  // namespace aicpu
@@ -0,0 +1,37 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MATRIXSOLVE_H_
#define AICPU_KERNELS_NORMALIZED_MATRIXSOLVE_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class MatrixSolveCpuKernel : public CpuKernel {
 public:
  MatrixSolveCpuKernel() = default;
  ~MatrixSolveCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t MatrixSolveDataAndTypeCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t MatrixSolveCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,315 @@

/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "max_pool_3d_grad_with_argmax.h"
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const char *kMaxPool3DGradWithArgmax = "MaxPool3DGradWithArgmax";

#define MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DTYPE, INTYPE, ARGTYPE, CTX) \
  case (DTYPE): {                                                             \
    uint32_t result = MaxPool3DGradWithArgmaxCompute<INTYPE, ARGTYPE>(CTX);   \
    if (result != KERNEL_STATUS_OK) {                                         \
      KERNEL_LOG_ERROR("MaxPool3DGradWithArgmax kernel compute failed.");     \
      return result;                                                          \
    }                                                                         \
    break;                                                                    \
  }
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MaxPool3DGradWithArgmaxCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
std::vector<std::string> attr_names = {"ksize", "strides", "pads", "dilation"};
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum, attr_names),
|
||||
"MaxPool3DGradWithArgmax check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MaxPool3DGradWithArgmaxParamCheck(ctx), "MaxPool3DGradWithArgmax check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
auto argmax_type = ctx.Input(2)->GetDataType();
|
||||
if (argmax_type == DT_INT32) {
|
||||
switch (data_type) {
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int32_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int32_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel input data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
} else if (argmax_type == DT_INT64) {
|
||||
switch (data_type) {
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int64_t, ctx)
|
||||
MAX_POOL3D_GRAD_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MaxPool3DGradWithArgmax kernel input data type [%s] not support.",
|
||||
DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
} else {
|
||||
KERNEL_LOG_ERROR(
|
||||
"MaxPool3DGradWithArgmax kernel input_argmax data type [%s] not "
|
||||
"support.",
|
||||
DTypeStr(argmax_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxParamCheck(CpuKernelContext &ctx) {
|
||||
auto input_x_info = ctx.Input(0);
|
||||
auto input_grads_info = ctx.Input(1);
|
||||
auto input_argmax_info = ctx.Input(2);
|
||||
auto output_y_info = ctx.Output(0);
|
||||
DataType input_x_type = input_x_info->GetDataType();
|
||||
DataType input_grads_type = input_grads_info->GetDataType();
|
||||
DataType out_type = output_y_info->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input_x_type == input_grads_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input x [%s] need be same with "
|
||||
"input grads [%s].",
|
||||
DTypeStr(input_x_type).c_str(), DTypeStr(input_grads_type).c_str())
|
||||
KERNEL_CHECK_FALSE((input_x_type == out_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input x [%s] need be same with "
|
||||
"output [%s].",
|
||||
DTypeStr(input_x_type).c_str(), DTypeStr(out_type).c_str())
|
||||
DataType input_argmax_type = input_argmax_info->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input_argmax_type == DT_INT32) || (input_argmax_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of output argmax:[%s] should be a int32 or int64. ",
|
||||
DTypeStr(input_argmax_type).c_str())
|
||||
|
||||
std::vector<int64_t> dim_vec = input_x_info->GetTensorShape()->GetDimSizes();
|
||||
int64_t dimsize = dim_vec.size();
|
||||
KERNEL_CHECK_FALSE(dimsize == 5, KERNEL_STATUS_PARAM_INVALID, "The dim of input:[%d] should be 5.", dimsize)
|
||||
|
||||
const size_t DIM_SIZE1 = 1;
|
||||
const size_t DIM_SIZE3 = 3;
|
||||
const size_t DIM_SIZE5 = 5;
|
||||
AttrValue *attr_ksize = ctx.GetAttr("ksize");
|
||||
std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
|
||||
KERNEL_CHECK_FALSE(ksizeList.size() == DIM_SIZE1 || ksizeList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The size of ksize:[%d] should be 1 or 3.", ksizeList.size())
|
||||
AttrValue *attr_strides = ctx.GetAttr("strides");
|
||||
std::vector<int64_t> stridesList = attr_strides->GetListInt();
|
||||
KERNEL_CHECK_FALSE(stridesList.size() == DIM_SIZE1 || stridesList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The size of strides:[%d] should be 1 or 3.", stridesList.size())
|
||||
AttrValue *attr_pads = ctx.GetAttr("pads");
|
||||
std::vector<int64_t> padsList = attr_pads->GetListInt();
|
||||
KERNEL_CHECK_FALSE(padsList.size() == DIM_SIZE1 || padsList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The size of pads:[%d] should be 1 or 3.", padsList.size())
|
||||
AttrValue *attr_dilation = ctx.GetAttr("dilation");
|
||||
std::vector<int64_t> dilationList = attr_dilation->GetListInt();
|
||||
KERNEL_CHECK_FALSE(
|
||||
dilationList.size() == DIM_SIZE1 || dilationList.size() == DIM_SIZE3 || dilationList.size() == DIM_SIZE5,
|
||||
KERNEL_STATUS_PARAM_INVALID, "The size of dilation:[%d] should be 1, 3 or 5.", dilationList.size())
|
||||
KERNEL_LOG_DEBUG(
|
||||
"MaxPool3DGradWithArgmaxCpuKernel[%s], input x: size[%llu];"
|
||||
"input grads: size[%llu], input argmax: size[%llu], output y: "
|
||||
"size[%lld].",
|
||||
ctx.GetOpType().c_str(), input_x_info->GetDataSize(), input_grads_info->GetDataSize(),
|
||||
input_argmax_info->GetDataSize(), output_y_info->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename S>
|
||||
void MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxSingleCompute(
|
||||
T *input_grad, S *input_argmax, T *output_y, int64_t iD, int64_t iH, int64_t iW, int64_t oD, int64_t oH, int64_t oW,
|
||||
int64_t kD, int64_t kH, int64_t kW, int64_t sD, int64_t sH, int64_t sW, int64_t pD, int64_t pH, int64_t pW,
|
||||
int64_t dD, int64_t dH, int64_t dW) {
|
||||
T *in_grad = input_grad;
|
||||
T *out_y = output_y;
|
||||
S *argmax = input_argmax;
|
||||
|
||||
/* calculate max points */
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
int64_t ti, i, j;
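  // Scatter pass: walk every gradient voxel and accumulate it into the
  // input-gradient position recorded in argmax; an argmax of -1 marks an
  // empty pooling window and contributes nothing.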
  for (ti = 0; ti < oD; ti++) {
    for (i = 0; i < oH; i++) {
      for (j = 0; j < oW; j++) {
        /* retrieve position of max */
        int64_t index = ti * oH * oW + i * oW + j;
        int64_t maxp = argmax[index];

        if (maxp != -1) {
          /* update gradient */
          out_y[maxp] += in_grad[index];
        }
      }
    }
  }
}

template <typename T, typename S>
uint32_t MaxPool3DGradWithArgmaxCpuKernel::MaxPool3DGradWithArgmaxCompute(CpuKernelContext &ctx) {
  auto input_x_info = ctx.Input(0);
  auto input_grads_info = ctx.Input(1);
  auto input_argmax_info = ctx.Input(2);
  auto output_y_info = ctx.Output(0);
  auto input_grads = reinterpret_cast<T *>(input_grads_info->GetData());
  auto input_argmax = reinterpret_cast<S *>(input_argmax_info->GetData());
  auto output_y = reinterpret_cast<T *>(output_y_info->GetData());
  AttrValue *attr_ksize = ctx.GetAttr("ksize");
  std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
  AttrValue *attr_strides = ctx.GetAttr("strides");
  std::vector<int64_t> stridesList = attr_strides->GetListInt();
  AttrValue *attr_pads = ctx.GetAttr("pads");
  std::vector<int64_t> padsList = attr_pads->GetListInt();
  AttrValue *attr_dilation = ctx.GetAttr("dilation");
  std::vector<int64_t> initList = {1, 1, 1, 1, 1};
  std::vector<int64_t> dilationList = (attr_dilation == nullptr) ? initList : attr_dilation->GetListInt();

  auto input_shape_vec = input_x_info->GetTensorShape()->GetDimSizes();
  auto output_shape_vec = input_grads_info->GetTensorShape()->GetDimSizes();
  const int64_t in_width = input_shape_vec[4];
  const int64_t in_height = input_shape_vec[3];
  const int64_t in_depth = input_shape_vec[2];
  const int64_t in_channel = input_shape_vec[1];
  const int64_t in_batch = input_shape_vec[0];
  const int64_t out_width = output_shape_vec[4];
  const int64_t out_height = output_shape_vec[3];
  const int64_t out_depth = output_shape_vec[2];
  const size_t DIM_SIZE1 = 1;
  const size_t DIM_SIZE5 = 5;
  std::vector<int64_t> ksizeTempList;
  if (ksizeList.size() == DIM_SIZE1) {
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[0]);
  } else {
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[1]);
    ksizeTempList.push_back(ksizeList[2]);
  }
  std::vector<int64_t> stridesTempList;
  if (stridesList.size() == DIM_SIZE1) {
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[0]);
  } else {
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[1]);
    stridesTempList.push_back(stridesList[2]);
  }
  std::vector<int64_t> padsTempList;
  if (padsList.size() == DIM_SIZE1) {
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[0]);
  } else {
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[1]);
    padsTempList.push_back(padsList[2]);
  }
  std::vector<int64_t> dilationTempList;
  if (dilationList.size() == DIM_SIZE1) {
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[0]);
  } else if (dilationList.size() == DIM_SIZE5) {
    dilationTempList.push_back(dilationList[2]);
    dilationTempList.push_back(dilationList[3]);
    dilationTempList.push_back(dilationList[4]);
  } else {
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[1]);
    dilationTempList.push_back(dilationList[2]);
  }
  const int64_t k_width = ksizeTempList[2];
  const int64_t k_height = ksizeTempList[1];
  const int64_t k_depth = ksizeTempList[0];
  const int64_t s_width = stridesTempList[2];
  const int64_t s_height = stridesTempList[1];
  const int64_t s_depth = stridesTempList[0];
  const int64_t p_width = padsTempList[2];
  const int64_t p_height = padsTempList[1];
  const int64_t p_depth = padsTempList[0];
  const int64_t d_width = dilationTempList[2];
  const int64_t d_height = dilationTempList[1];
  const int64_t d_depth = dilationTempList[0];
  KERNEL_CHECK_FALSE(k_width / 2 >= p_width && k_height / 2 >= p_height && k_depth / 2 >= p_depth,
                     KERNEL_STATUS_PARAM_INVALID, "pads should be smaller than or equal to half of kernel size.");

  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t batch = in_batch * in_channel;
  const int64_t in_stride = in_width * in_height * in_depth;
  const int64_t out_stride = out_width * out_height * out_depth;
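  // Parallelization heuristic: go multi-threaded once the input spans more
  // than 16 batch*channel volumes (in_depth * in_height * in_width elements
  // each), and use at most 4 cores until it exceeds 72 volumes.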
  const int64_t kParallelDataNum = 16 * in_width * in_height * in_depth;
  const int64_t kParallelDataNumMid = 72 * in_width * in_height * in_depth;
  const float ZERO = 0.f;
  int64_t output_num = ctx.Output(0)->NumElements();
  for (int64_t i = 0; i < output_num; i++) {
    output_y[i] = static_cast<T>(ZERO);
  }
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    auto sharder_max_pool3d_grad_with_argmax = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        MaxPool3DGradWithArgmaxSingleCompute(input_grads + i * out_stride, input_argmax + i * out_stride,
                                             output_y + i * in_stride, in_depth, in_height, in_width, out_depth,
                                             out_height, out_width, k_depth, k_height, k_width, s_depth, s_height,
                                             s_width, p_depth, p_height, p_width, d_depth, d_height, d_width);
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
    }
    KERNEL_HANDLE_ERROR(
      CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_max_pool3d_grad_with_argmax),
      "MaxPool3DGradWithArgmax Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      MaxPool3DGradWithArgmaxSingleCompute(input_grads + i * out_stride, input_argmax + i * out_stride,
                                           output_y + i * in_stride, in_depth, in_height, in_width, out_depth,
                                           out_height, out_width, k_depth, k_height, k_width, s_depth, s_height,
                                           s_width, p_depth, p_height, p_width, d_depth, d_height, d_width);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMaxPool3DGradWithArgmax, MaxPool3DGradWithArgmaxCpuKernel);
}  // namespace aicpu

@@ -0,0 +1,45 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MAX_POOL3D_GRAD_WITH_ARGMAX_H_
#define AICPU_KERNELS_NORMALIZED_MAX_POOL3D_GRAD_WITH_ARGMAX_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MaxPool3DGradWithArgmaxCpuKernel : public CpuKernel {
 public:
  MaxPool3DGradWithArgmaxCpuKernel() = default;
  ~MaxPool3DGradWithArgmaxCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t MaxPool3DGradWithArgmaxParamCheck(CpuKernelContext &ctx);

  template <typename T, typename S>
  uint32_t MaxPool3DGradWithArgmaxCompute(CpuKernelContext &ctx);

  template <typename T, typename S>
  void MaxPool3DGradWithArgmaxSingleCompute(T *input_x, S *input_argmax, T *output_y, int64_t iD, int64_t iH,
                                            int64_t iW, int64_t oD, int64_t oH, int64_t oW, int64_t kD, int64_t kH,
                                            int64_t kW, int64_t sD, int64_t sH, int64_t sW, int64_t pD, int64_t pH,
                                            int64_t pW, int64_t dD, int64_t dH, int64_t dW);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_POOL3D_GRAD_WITH_ARGMAX_H_
@@ -0,0 +1,342 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "max_pool_3d_with_argmax.h"
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 1;
const char *kMaxPool3DWithArgmax = "MaxPool3DWithArgmax";

#define MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DTYPE, INTYPE, OUTTYPE, CTX) \
  case (DTYPE): {                                                        \
    uint32_t result = MaxPool3DWithArgmaxCompute<INTYPE, OUTTYPE>(CTX);  \
    if (result != KERNEL_STATUS_OK) {                                    \
      KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel compute failed.");    \
      return result;                                                     \
    }                                                                    \
    break;                                                               \
  }
}  // namespace

namespace aicpu {
uint32_t MaxPool3DWithArgmaxCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  std::vector<std::string> attr_names = {"ksize", "strides", "pads"};
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum, attr_names),
                      "MaxPool3DWithArgmax check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxPool3DWithArgmaxParamCheck(ctx), "MaxPool3DWithArgmax check params failed.");
  auto in_data_type = ctx.Input(0)->GetDataType();
  auto out_data_type = ctx.Output(1)->GetDataType();
  std::string argmax_type =
    (ctx.GetAttr("argmax_type") == nullptr) ? "bitmask" : ctx.GetAttr("argmax_type")->GetString();
  if (argmax_type == "bitmask") {
    KERNEL_LOG_ERROR("Bitmask is not supported now.");
    return KERNEL_STATUS_PARAM_INVALID;
  } else {
    if (out_data_type == DT_INT32) {
      switch (in_data_type) {
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int32_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int32_t, ctx)
        default:
          KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel input data type [%s] not support.",
                           DTypeStr(in_data_type).c_str());
          return KERNEL_STATUS_PARAM_INVALID;
      }
    } else if (out_data_type == DT_INT64) {
      switch (in_data_type) {
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT8, int8_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT16, int16_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT32, int32_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_INT64, int64_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT8, uint8_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT16, uint16_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT32, uint32_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_UINT64, uint64_t, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_FLOAT, float, int64_t, ctx)
        MAX_POOL3D_WITH_ARGMAX_COMPUTE_CASE(DT_DOUBLE, double, int64_t, ctx)
        default:
          KERNEL_LOG_ERROR("MaxPool3DWithArgmax kernel input data type [%s] not support.",
                           DTypeStr(in_data_type).c_str());
          return KERNEL_STATUS_PARAM_INVALID;
      }
    } else {
      KERNEL_LOG_ERROR(
        "MaxPool3DWithArgmax kernel output_argmax data type [%s] not "
        "support.",
        DTypeStr(out_data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }

    return KERNEL_STATUS_OK;
  }
}

uint32_t MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxParamCheck(CpuKernelContext &ctx) {
  auto input_info = ctx.Input(0);
  auto output_y_info = ctx.Output(0);
  auto output_argmax_info = ctx.Output(1);
  DataType input_type = input_info->GetDataType();
  DataType output_y_type = output_y_info->GetDataType();
  KERNEL_CHECK_FALSE((input_type == output_y_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input x [%s] need be same with "
                     "output y [%s].",
                     DTypeStr(input_type).c_str(), DTypeStr(output_y_type).c_str())
  DataType output_argmax_type = output_argmax_info->GetDataType();
  KERNEL_CHECK_FALSE((output_argmax_type == DT_INT32) || (output_argmax_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of output argmax [%s] should be int32 or int64.",
                     DTypeStr(output_argmax_type).c_str())

  std::vector<int64_t> dim_vec = input_info->GetTensorShape()->GetDimSizes();
  int64_t dimsize = dim_vec.size();
  KERNEL_CHECK_FALSE(dimsize == 5, KERNEL_STATUS_PARAM_INVALID, "The dim of input:[%ld] should be 5.", dimsize)

  const size_t DIM_SIZE1 = 1;
  const size_t DIM_SIZE3 = 3;
  const size_t DIM_SIZE5 = 5;
  AttrValue *attr_ksize = ctx.GetAttr("ksize");
  std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
  KERNEL_CHECK_FALSE(ksizeList.size() == DIM_SIZE1 || ksizeList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
                     "The size of ksize:[%zu] should be 1 or 3.", ksizeList.size())
  AttrValue *attr_strides = ctx.GetAttr("strides");
  std::vector<int64_t> stridesList = attr_strides->GetListInt();
  KERNEL_CHECK_FALSE(stridesList.size() == DIM_SIZE1 || stridesList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
                     "The size of strides:[%zu] should be 1 or 3.", stridesList.size())
  AttrValue *attr_pads = ctx.GetAttr("pads");
  std::vector<int64_t> padsList = attr_pads->GetListInt();
  KERNEL_CHECK_FALSE(padsList.size() == DIM_SIZE1 || padsList.size() == DIM_SIZE3, KERNEL_STATUS_PARAM_INVALID,
                     "The size of pads:[%zu] should be 1 or 3.", padsList.size())
  AttrValue *attr_dilation = ctx.GetAttr("dilation");
  std::vector<int64_t> initList = {1, 1, 1, 1, 1};
  std::vector<int64_t> dilationList = (attr_dilation == nullptr) ? initList : attr_dilation->GetListInt();
  KERNEL_CHECK_FALSE(
    dilationList.size() == DIM_SIZE1 || dilationList.size() == DIM_SIZE3 || dilationList.size() == DIM_SIZE5,
    KERNEL_STATUS_PARAM_INVALID, "The size of dilation:[%zu] should be 1, 3 or 5.", dilationList.size())
  KERNEL_LOG_DEBUG(
    "MaxPool3DWithArgmaxCpuKernel[%s], input x: size[%llu];"
    "output y: size[%llu], output argmax: size[%llu].",
    ctx.GetOpType().c_str(), input_info->GetDataSize(), output_y_info->GetDataSize(),
    output_argmax_info->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename T, typename S>
void MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxSingleCompute(T *input, T *output_y, S *output_argmax,
                                                                    int64_t iD, int64_t iH, int64_t iW, int64_t oD,
                                                                    int64_t oH, int64_t oW, int64_t kD, int64_t kH,
                                                                    int64_t kW, int64_t sD, int64_t sH, int64_t sW,
                                                                    int64_t pD, int64_t pH, int64_t pW, int64_t dD,
                                                                    int64_t dH, int64_t dW) {
  int64_t i, j, ti;
  T *ip = input;
  for (ti = 0; ti < oD; ti++) {
    for (i = 0; i < oH; i++) {
      for (j = 0; j < oW; j++) {
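        /* The pooling window for output voxel (ti, i, j) covers input
           positions [start, start + (k - 1) * d + 1) per axis with dilation
           d, clipped to the input extent; the while loops below advance the
           start past padded (negative) positions. */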
        int64_t start_t = ti * sD - pD;
        int64_t start_h = i * sH - pH;
        int64_t start_w = j * sW - pW;

        int64_t end_t = std::min(start_t + (kD - 1) * dD + 1, iD);
        int64_t end_h = std::min(start_h + (kH - 1) * dH + 1, iH);
        int64_t end_w = std::min(start_w + (kW - 1) * dW + 1, iW);

        while (start_t < 0) {
          start_t += dD;
        }
        while (start_h < 0) {
          start_h += dH;
        }
        while (start_w < 0) {
          start_w += dW;
        }

        T *op = output_y + ti * oW * oH + i * oW + j;
        S *indzp = output_argmax + ti * oW * oH + i * oW + j;

        S maxindex = start_t * iH * iW + start_h * iW + start_w;
        T maxval = -std::numeric_limits<T>::infinity();

        for (int64_t z = start_t; z < end_t; z += dD) {
          for (int64_t y = start_h; y < end_h; y += dH) {
            for (int64_t x = start_w; x < end_w; x += dW) {
              S index = z * iH * iW + y * iW + x;
              T val = ip[index];
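              // A NaN value always satisfies this test, so NaNs in the window
              // propagate to the pooled output instead of being skipped.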
              if ((val > maxval) || std::isnan(static_cast<double>(val))) {
                maxval = (T)val;
                maxindex = index;
              }
            }
          }
        }

        // store location of max
        *indzp = maxindex;

        /* set output to local max */
        *op = maxval;
      }
    }
  }
}

template <typename T, typename S>
uint32_t MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxCompute(CpuKernelContext &ctx) {
  auto input_info = ctx.Input(0);
  auto output_y_info = ctx.Output(0);
  auto output_argmax_info = ctx.Output(1);
  auto input_x = reinterpret_cast<T *>(input_info->GetData());
  auto output_y = reinterpret_cast<T *>(output_y_info->GetData());
  auto output_argmax = reinterpret_cast<S *>(output_argmax_info->GetData());
  AttrValue *attr_ksize = ctx.GetAttr("ksize");
  std::vector<int64_t> ksizeList = attr_ksize->GetListInt();
  AttrValue *attr_strides = ctx.GetAttr("strides");
  std::vector<int64_t> stridesList = attr_strides->GetListInt();
  AttrValue *attr_pads = ctx.GetAttr("pads");
  std::vector<int64_t> padsList = attr_pads->GetListInt();
  AttrValue *attr_dilation = ctx.GetAttr("dilation");
  std::vector<int64_t> initList = {1, 1, 1, 1, 1};
  std::vector<int64_t> dilationList = (attr_dilation == nullptr) ? initList : attr_dilation->GetListInt();

  auto input_shape_vec = input_info->GetTensorShape()->GetDimSizes();
  auto output_shape_vec = output_y_info->GetTensorShape()->GetDimSizes();
  const int64_t in_width = input_shape_vec[4];
  const int64_t in_height = input_shape_vec[3];
  const int64_t in_depth = input_shape_vec[2];
  const int64_t in_channel = input_shape_vec[1];
  const int64_t in_batch = input_shape_vec[0];
  const int64_t out_width = output_shape_vec[4];
  const int64_t out_height = output_shape_vec[3];
  const int64_t out_depth = output_shape_vec[2];
  const size_t DIM_SIZE1 = 1;
  const size_t DIM_SIZE5 = 5;
  std::vector<int64_t> ksizeTempList;
  if (ksizeList.size() == DIM_SIZE1) {
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[0]);
  } else {
    ksizeTempList.push_back(ksizeList[0]);
    ksizeTempList.push_back(ksizeList[1]);
    ksizeTempList.push_back(ksizeList[2]);
  }
  std::vector<int64_t> stridesTempList;
  if (stridesList.size() == DIM_SIZE1) {
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[0]);
  } else {
    stridesTempList.push_back(stridesList[0]);
    stridesTempList.push_back(stridesList[1]);
    stridesTempList.push_back(stridesList[2]);
  }
  std::vector<int64_t> padsTempList;
  if (padsList.size() == DIM_SIZE1) {
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[0]);
  } else {
    padsTempList.push_back(padsList[0]);
    padsTempList.push_back(padsList[1]);
    padsTempList.push_back(padsList[2]);
  }
  std::vector<int64_t> dilationTempList;
  if (dilationList.size() == DIM_SIZE1) {
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[0]);
  } else if (dilationList.size() == DIM_SIZE5) {
    dilationTempList.push_back(dilationList[2]);
    dilationTempList.push_back(dilationList[3]);
    dilationTempList.push_back(dilationList[4]);
  } else {
    dilationTempList.push_back(dilationList[0]);
    dilationTempList.push_back(dilationList[1]);
    dilationTempList.push_back(dilationList[2]);
  }
  const int64_t k_width = ksizeTempList[2];
  const int64_t k_height = ksizeTempList[1];
  const int64_t k_depth = ksizeTempList[0];
  const int64_t s_width = stridesTempList[2];
  const int64_t s_height = stridesTempList[1];
  const int64_t s_depth = stridesTempList[0];
  const int64_t p_width = padsTempList[2];
  const int64_t p_height = padsTempList[1];
  const int64_t p_depth = padsTempList[0];
  const int64_t d_width = dilationTempList[2];
  const int64_t d_height = dilationTempList[1];
  const int64_t d_depth = dilationTempList[0];
  KERNEL_CHECK_FALSE(k_width / 2 >= p_width && k_height / 2 >= p_height && k_depth / 2 >= p_depth,
                     KERNEL_STATUS_PARAM_INVALID, "pads should be smaller than or equal to half of kernel size.");

  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t batch = in_batch * in_channel;
  const int64_t in_stride = in_width * in_height * in_depth;
  const int64_t out_stride = out_width * out_height * out_depth;
  const int64_t kParallelDataNum = 16 * in_width * in_height * in_depth;
  const int64_t kParallelDataNumMid = 72 * in_width * in_height * in_depth;
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    auto sharder_max_pool3d_with_argmax = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        MaxPool3DWithArgmaxSingleCompute(input_x + i * in_stride, output_y + i * out_stride,
                                         output_argmax + i * out_stride, in_depth, in_height, in_width, out_depth,
                                         out_height, out_width, k_depth, k_height, k_width, s_depth, s_height, s_width,
                                         p_depth, p_height, p_width, d_depth, d_height, d_width);
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_max_pool3d_with_argmax),
                        "MaxPool3DWithArgmax Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      MaxPool3DWithArgmaxSingleCompute(input_x + i * in_stride, output_y + i * out_stride,
                                       output_argmax + i * out_stride, in_depth, in_height, in_width, out_depth,
                                       out_height, out_width, k_depth, k_height, k_width, s_depth, s_height, s_width,
                                       p_depth, p_height, p_width, d_depth, d_height, d_width);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMaxPool3DWithArgmax, MaxPool3DWithArgmaxCpuKernel);
}  // namespace aicpu

@@ -0,0 +1,46 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MAX_POOL3D_WITH_ARGMAX_H_
#define AICPU_KERNELS_NORMALIZED_MAX_POOL3D_WITH_ARGMAX_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MaxPool3DWithArgmaxCpuKernel : public CpuKernel {
 public:
  MaxPool3DWithArgmaxCpuKernel() = default;
  ~MaxPool3DWithArgmaxCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t MaxPool3DWithArgmaxParamCheck(CpuKernelContext &ctx);

  template <typename T, typename S>
  uint32_t MaxPool3DWithArgmaxCompute(CpuKernelContext &ctx);

  template <typename T, typename S>
  void MaxPool3DWithArgmaxSingleCompute(T *input, T *output_y, S *output_argmax, int64_t iD, int64_t iH, int64_t iW,
                                        int64_t oD, int64_t oH, int64_t oW, int64_t kD, int64_t kH, int64_t kW,
                                        int64_t sD, int64_t sH, int64_t sW, int64_t pD, int64_t pH, int64_t pW,
                                        int64_t dD, int64_t dH, int64_t dW);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_POOL3D_WITH_ARGMAX_H_
@@ -0,0 +1,235 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "max_unpool_2d.h"

#include <cmath>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool2D = "MaxUnpool2D";

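// SWITCH_PARALLEL runs the shard serially when there are at most
// kParallelDataNums work items; otherwise it hands the range to
// CpuKernelUtils::ParallelFor with a grain size of 1.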
#define SWITCH_PARALLEL(SHARD, end_num, ctx)                                  \
  if (end_num <= kParallelDataNums) {                                         \
    for (size_t i = 0; i < size_t(end_num); i++) {                            \
      SHARD(i, i + 1);                                                        \
    }                                                                         \
  } else {                                                                    \
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD),  \
                        "MaxUnpool2D " #SHARD " Compute failed.");            \
  }

}  // namespace

namespace aicpu {
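// Two-level dispatch: Compute() switches on the data dtype, and this helper
// then switches on the indices dtype (int32 or int64).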
template <typename DATA_T>
uint32_t MaxUnpool2DCpuKernel::MaxUnpool2D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
  // Compute by indices_type
  switch (indices_type) {
    case DT_INT32:
      return MaxUnpool2DCompute<DATA_T, int32_t>(ctx);
    case DT_INT64:
      return MaxUnpool2DCompute<DATA_T, int64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t MaxUnpool2DCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool2D check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxUnpool2DCheck(ctx), "MaxUnpool2D check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  auto indices_type = ctx.Input(1)->GetDataType();
  switch (data_type) {
    case DT_INT8:
      return MaxUnpool2D_COMPUTE_CASE<int8_t>(ctx, indices_type);
    case DT_INT16:
      return MaxUnpool2D_COMPUTE_CASE<int16_t>(ctx, indices_type);
    case DT_INT32:
      return MaxUnpool2D_COMPUTE_CASE<int32_t>(ctx, indices_type);
    case DT_INT64:
      return MaxUnpool2D_COMPUTE_CASE<int64_t>(ctx, indices_type);
    case DT_UINT8:
      return MaxUnpool2D_COMPUTE_CASE<uint8_t>(ctx, indices_type);
    case DT_UINT16:
      return MaxUnpool2D_COMPUTE_CASE<uint16_t>(ctx, indices_type);
    case DT_UINT32:
      return MaxUnpool2D_COMPUTE_CASE<uint32_t>(ctx, indices_type);
    case DT_UINT64:
      return MaxUnpool2D_COMPUTE_CASE<uint64_t>(ctx, indices_type);
    case DT_FLOAT16:
      return MaxUnpool2D_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
    case DT_FLOAT:
      return MaxUnpool2D_COMPUTE_CASE<float>(ctx, indices_type);
    case DT_DOUBLE:
      return MaxUnpool2D_COMPUTE_CASE<double>(ctx, indices_type);
    default:
      KERNEL_LOG_ERROR("MaxUnpool2D kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t MaxUnpool2DCpuKernel::MaxUnpool2DCheck(CpuKernelContext &ctx) {
  DataType input0Type = ctx.Input(0)->GetDataType();
  DataType outputType = ctx.Output(0)->GetDataType();
  KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of output [%d] need be same with "
                     "input0 [%d].",
                     outputType, input0Type)

  KERNEL_LOG_INFO(
    "MaxUnpool2DCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Output(0)->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool2DCpuKernel::MaxUnpool2DCompute(CpuKernelContext &ctx) {
  Tensor *input = ctx.Input(0);
  Tensor *indices = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  std::string dataFormat = "NCHW";
  if (ctx.GetAttr("data_format") != nullptr) {
    dataFormat = ctx.GetAttr("data_format")->GetString();
  }
  int32_t NIndex, CIndex, HIndex, WIndex;
  bool error = false;
  if (dataFormat == "NHWC") {
    NIndex = 0;
    CIndex = 3;
    HIndex = 1;
    WIndex = 2;
    auto inputShape = input->GetTensorShape();
    int64_t numBatch = inputShape->GetDimSize(NIndex);
    int64_t inputHeight = inputShape->GetDimSize(HIndex);
    int64_t inputWidth = inputShape->GetDimSize(WIndex);
    int64_t numChannels = inputShape->GetDimSize(CIndex);
    auto output_shape = output->GetTensorShape();
    int64_t oheight = output_shape->GetDimSize(HIndex);
    int64_t owidth = output_shape->GetDimSize(WIndex);

    auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * oheight * owidth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }
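    // Scatter pass (NHWC): argmax stores, for each input element, a flat
    // offset into the output H_out * W_out plane; the element is written to
    // flat position maxp * numChannels + k of the batch's output slab.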
    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * owidth * oheight;
        int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
        DATA_T *output_p_k = rawOutput + nOutputOffset;
        DATA_T *input_p_k = rawInput + nInputOffset;
        INDICES_T *ind_p_k = rawIndices + nInputOffset;

        int64_t maxp;
        for (int64_t k = 0; k < numChannels; k++) {
          for (int64_t i = 0; i < inputHeight; i++) {
            for (int64_t j = 0; j < inputWidth; j++) {
              maxp = ind_p_k[i * inputWidth * numChannels + j * numChannels + k];
              if (maxp < 0 || maxp >= owidth * oheight) {
                error = true;
                KERNEL_LOG_ERROR(
                  "MaxUnpool2D: each value in argmax must lie in "
                  "[0, H_out * W_out); H_out is [%ld], W_out is [%ld], "
                  "but argmax contains [%ld].",
                  oheight, owidth, maxp);
              } else {
                output_p_k[maxp * numChannels + k] = input_p_k[i * inputWidth * numChannels + j * numChannels + k];
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  } else {
    NIndex = 0;
    CIndex = 1;
    HIndex = 2;
    WIndex = 3;
    auto inputShape = input->GetTensorShape();
    int64_t numBatch = inputShape->GetDimSize(NIndex);
    int64_t inputHeight = inputShape->GetDimSize(HIndex);
    int64_t inputWidth = inputShape->GetDimSize(WIndex);
    int64_t numChannels = inputShape->GetDimSize(CIndex);

    auto output_shape = output->GetTensorShape();
    int64_t oheight = output_shape->GetDimSize(HIndex);
    int64_t owidth = output_shape->GetDimSize(WIndex);
    auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());

    for (int64_t s = 0; s < numBatch * oheight * owidth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }
    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * owidth * oheight;
        int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
        int64_t k = 0;
        for (k = 0; k < numChannels; k++) {
          int64_t finalOutputOffset = nOutputOffset + k * owidth * oheight;
          int64_t finalInputOffset = nInputOffset + k * inputWidth * inputHeight;
          DATA_T *output_p_k = rawOutput + finalOutputOffset;
          DATA_T *input_p_k = rawInput + finalInputOffset;
          INDICES_T *ind_p_k = rawIndices + finalInputOffset;

          int64_t maxp;
          for (int64_t i = 0; i < inputHeight; i++) {
            for (int64_t j = 0; j < inputWidth; j++) {
              maxp = ind_p_k[i * inputWidth + j];
              if (maxp < 0 || maxp >= owidth * oheight) {
                error = true;
                KERNEL_LOG_ERROR(
                  "MaxUnpool2D: each value in argmax must lie in "
                  "[0, H_out * W_out); H_out is [%ld], W_out is [%ld], "
                  "but argmax contains [%ld].",
                  oheight, owidth, maxp);
              } else {
                output_p_k[maxp] = input_p_k[i * inputWidth + j];
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  }

  if (error) {
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMaxUnpool2D, MaxUnpool2DCpuKernel);
}  // namespace aicpu

@@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"

namespace aicpu {
class MaxUnpool2DCpuKernel : public CpuKernel {
 public:
  MaxUnpool2DCpuKernel() = default;
  ~MaxUnpool2DCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MaxUnpool2DCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MaxUnpool2D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);

  template <typename T, typename S>
  static uint32_t MaxUnpool2DCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_H_

@@ -0,0 +1,247 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "max_unpool_2d_grad.h"

#include <cmath>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool2DGrad = "MaxUnpool2DGrad";

#define SWITCH_PARALLEL(SHARD, end_num, ctx)                                  \
  if (end_num <= kParallelDataNums) {                                         \
    for (size_t i = 0; i < size_t(end_num); i++) {                            \
      SHARD(i, i + 1);                                                        \
    }                                                                         \
  } else {                                                                    \
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD),  \
                        "MaxUnpool2DGrad " #SHARD " Compute failed.");        \
  }

}  // namespace

namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool2DGradCpuKernel::MaxUnpool2DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
  // Compute by indices_type
  switch (indices_type) {
    case DT_INT32:
      return MaxUnpool2DGradCompute<DATA_T, int32_t>(ctx);
    case DT_INT64:
      return MaxUnpool2DGradCompute<DATA_T, int64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t MaxUnpool2DGradCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool2DGrad check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxUnpool2DGradCheck(ctx), "MaxUnpool2DGrad check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  auto indices_type = ctx.Input(2)->GetDataType();
  switch (data_type) {
    case DT_INT8:
      return MaxUnpool2DGrad_COMPUTE_CASE<int8_t>(ctx, indices_type);
    case DT_INT16:
      return MaxUnpool2DGrad_COMPUTE_CASE<int16_t>(ctx, indices_type);
    case DT_INT32:
      return MaxUnpool2DGrad_COMPUTE_CASE<int32_t>(ctx, indices_type);
    case DT_INT64:
      return MaxUnpool2DGrad_COMPUTE_CASE<int64_t>(ctx, indices_type);
    case DT_UINT8:
      return MaxUnpool2DGrad_COMPUTE_CASE<uint8_t>(ctx, indices_type);
    case DT_UINT16:
      return MaxUnpool2DGrad_COMPUTE_CASE<uint16_t>(ctx, indices_type);
    case DT_UINT32:
      return MaxUnpool2DGrad_COMPUTE_CASE<uint32_t>(ctx, indices_type);
    case DT_UINT64:
      return MaxUnpool2DGrad_COMPUTE_CASE<uint64_t>(ctx, indices_type);
    case DT_FLOAT16:
      return MaxUnpool2DGrad_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
    case DT_FLOAT:
      return MaxUnpool2DGrad_COMPUTE_CASE<float>(ctx, indices_type);
    case DT_DOUBLE:
      return MaxUnpool2DGrad_COMPUTE_CASE<double>(ctx, indices_type);
    default:
      KERNEL_LOG_ERROR("MaxUnpool2DGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCheck(CpuKernelContext &ctx) {
  DataType input0Type = ctx.Input(0)->GetDataType();
  DataType input1Type = ctx.Input(1)->GetDataType();
  DataType outputType = ctx.Output(0)->GetDataType();
  KERNEL_CHECK_FALSE((input0Type == input1Type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input1 [%d] need be same with "
                     "input0 [%d].",
                     input1Type, input0Type)

  KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of output [%d] need be same with "
                     "input0 [%d].",
                     outputType, input0Type)

  auto Input0_size = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  auto Input2_size = ctx.Input(2)->GetTensorShape()->GetDimSizes();

  KERNEL_CHECK_FALSE((Input0_size == Input2_size), KERNEL_STATUS_PARAM_INVALID,
                     "The shape of input x need be same with input argmax.")

  KERNEL_LOG_INFO(
    "MaxUnpool2DGradCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], input2: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Input(2)->GetDataSize(),
    ctx.Output(0)->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCompute(CpuKernelContext &ctx) {
  Tensor *grads = ctx.Input(1);
  Tensor *indices = ctx.Input(2);
  Tensor *output = ctx.Output(0);
  std::string dataFormat = "NCHW";
  if (ctx.GetAttr("data_format") != nullptr) {
    dataFormat = ctx.GetAttr("data_format")->GetString();
  }
  int32_t NIndex, CIndex, HIndex, WIndex;
  bool error = false;
  if (dataFormat == "NHWC") {
    NIndex = 0;
    CIndex = 3;
    HIndex = 1;
    WIndex = 2;
    auto grads_out_shape = grads->GetTensorShape();
    int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
    int64_t oheight = grads_out_shape->GetDimSize(HIndex);
    int64_t owidth = grads_out_shape->GetDimSize(WIndex);
    int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
    auto output_shape = output->GetTensorShape();
    int64_t iheight = output_shape->GetDimSize(HIndex);
    int64_t iwidth = output_shape->GetDimSize(WIndex);
    auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int64_t s = 0; s < numBatch * iheight * iwidth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }
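    // Gather pass (NHWC): the backward of MaxUnpool2D reads, for each x
    // position, the incoming gradient at the flat offset stored in argmax,
    // i.e. output[n][i][j][k] = grads[n][maxp][k].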
|
||||
|
||||
auto shard = [&](int64_t start, int64_t end) {
|
||||
for (int64_t n = start; n < end; n++) {
|
||||
int64_t nOutputOffset = n * numChannels * iwidth * iheight;
|
||||
int64_t nGradsOffset = n * numChannels * owidth * oheight;
|
||||
DATA_T *output_p_k = rawOutput + nOutputOffset;
|
||||
DATA_T *grads_p_k = rawGrads + nGradsOffset;
|
||||
INDICES_T *ind_p_k = rawIndices + nOutputOffset;
|
||||
int64_t maxp;
|
||||
for (int64_t k = 0; k < numChannels; k++) {
|
||||
for (int64_t i = 0; i < iheight; i++) {
|
||||
for (int64_t j = 0; j < iwidth; j++) {
|
||||
maxp = ind_p_k[i * iwidth * numChannels + j * numChannels + k];
|
||||
if (maxp < 0 || maxp >= owidth * oheight) {
|
||||
error = true;
|
||||
KERNEL_LOG_ERROR(
|
||||
"MaxUnpool2DGrad: output_size H_out * W_out "
|
||||
"should be bigger than argmax, now H_out is [%ld], "
|
||||
"and W_out is [%ld], but one of the values in argmax is "
|
||||
"[%ld].",
|
||||
oheight, owidth, maxp);
|
||||
} else {
|
||||
output_p_k[i * iwidth * numChannels + j * numChannels + k] = grads_p_k[maxp * numChannels + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
SWITCH_PARALLEL(shard, numBatch, ctx);
|
||||
} else {
|
||||
NIndex = 0;
|
||||
CIndex = 1;
|
||||
HIndex = 2;
|
||||
WIndex = 3;
|
||||
auto grads_out_shape = grads->GetTensorShape();
|
||||
int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
|
||||
int64_t oheight = grads_out_shape->GetDimSize(HIndex);
|
||||
int64_t owidth = grads_out_shape->GetDimSize(WIndex);
|
||||
int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
|
||||
auto output_shape = output->GetTensorShape();
|
||||
int64_t iheight = output_shape->GetDimSize(HIndex);
|
||||
int64_t iwidth = output_shape->GetDimSize(WIndex);
|
||||
auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
|
||||
auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
|
||||
auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
|
||||
for (int s = 0; s < numBatch * iheight * iwidth * numChannels; s++) {
|
||||
rawOutput[s] = (DATA_T)0;
|
||||
}
|
||||
|
||||
auto shard = [&](int64_t start, int64_t end) {
|
||||
for (int64_t n = start; n < end; n++) {
|
||||
int64_t nOutputOffset = n * numChannels * iwidth * iheight;
|
||||
int64_t nGradsOffset = n * numChannels * owidth * oheight;
|
||||
int64_t k = 0;
|
||||
for (k = 0; k < numChannels; k++) {
|
||||
int64_t finalOutputOffset = nOutputOffset + k * iwidth * iheight;
|
||||
int64_t finalGradsOffset = nGradsOffset + k * owidth * oheight;
|
||||
DATA_T *output_p_k = rawOutput + finalOutputOffset;
|
||||
DATA_T *grads_p_k = rawGrads + finalGradsOffset;
|
||||
INDICES_T *ind_p_k = rawIndices + finalOutputOffset;
|
||||
int64_t maxp;
|
||||
for (int64_t i = 0; i < iheight; i++) {
|
||||
for (int64_t j = 0; j < iwidth; j++) {
|
||||
maxp = ind_p_k[i * iwidth + j];
|
||||
if (maxp < 0 || maxp >= owidth * oheight) {
|
||||
error = true;
|
||||
KERNEL_LOG_ERROR(
|
||||
"MaxUnpool2DGrad: output_size H_out * W_out "
|
||||
"should be bigger than argmax, now H_out is [%ld], "
|
||||
"and W_out is [%ld], but one of the values in argmax is "
|
||||
"[%ld].",
|
||||
oheight, owidth, maxp);
|
||||
} else {
|
||||
output_p_k[i * iwidth + j] = grads_p_k[maxp];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
SWITCH_PARALLEL(shard, numBatch, ctx);
|
||||
}
|
||||
|
||||
if (error == true) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else {
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMaxUnpool2DGrad, MaxUnpool2DGradCpuKernel);
|
||||
} // namespace aicpu
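Stripped of the layout bookkeeping, both branches of MaxUnpool2DGradCompute do the same per-(batch, channel) gather: each unpooled position looks up its pooling winner in argmax and copies that pooled gradient. Below is a minimal standalone sketch of that inner step, assuming float data; UnpoolGradPlane and its parameters are illustrative names, not part of the kernel's API.

#include <cstdint>
#include <vector>

// Sketch: gather pooled gradients for one (batch, channel) plane, as in the
// NCHW branch above. Returns false when an argmax entry is out of range.
static bool UnpoolGradPlane(const std::vector<float> &grad, const std::vector<int64_t> &argmax,
                            int64_t iheight, int64_t iwidth, int64_t oheight, int64_t owidth,
                            std::vector<float> *dx) {
  dx->assign(static_cast<size_t>(iheight * iwidth), 0.0f);
  for (int64_t i = 0; i < iheight; i++) {
    for (int64_t j = 0; j < iwidth; j++) {
      int64_t maxp = argmax[i * iwidth + j];
      if (maxp < 0 || maxp >= oheight * owidth) {
        return false;  // same bounds check the kernel logs an error for
      }
      (*dx)[i * iwidth + j] = grad[maxp];
    }
  }
  return true;
}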
@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_GRAD_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool2DGradCpuKernel : public CpuKernel {
 public:
  MaxUnpool2DGradCpuKernel() = default;
  ~MaxUnpool2DGradCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MaxUnpool2DGradCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MaxUnpool2DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);

  template <typename T, typename S>
  static uint32_t MaxUnpool2DGradCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL2D_GRAD_H_
@ -0,0 +1,247 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "max_unpool_3d.h"

#include <cmath>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool3D = "MaxUnpool3D";

#define SWITCH_PARALLEL(SHARD, end_num, ctx)                                                                        \
  if (end_num <= kParallelDataNums) {                                                                               \
    for (size_t i = 0; i < size_t(end_num); i++) {                                                                  \
      SHARD(i, i + 1);                                                                                              \
    }                                                                                                               \
  } else {                                                                                                          \
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), "MaxUnpool3D #SHARD Compute failed."); \
  }

}  // namespace

namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool3DCpuKernel::MaxUnpool3D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
  // Dispatch on the data type of the indices tensor.
  switch (indices_type) {
    case DT_INT32:
      return MaxUnpool3DCompute<DATA_T, int32_t>(ctx);
    case DT_INT64:
      return MaxUnpool3DCompute<DATA_T, int64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t MaxUnpool3DCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool3D check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxUnpool3DCheck(ctx), "MaxUnpool3D check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  auto indices_type = ctx.Input(1)->GetDataType();
  switch (data_type) {
    case DT_INT8:
      return MaxUnpool3D_COMPUTE_CASE<int8_t>(ctx, indices_type);
    case DT_INT16:
      return MaxUnpool3D_COMPUTE_CASE<int16_t>(ctx, indices_type);
    case DT_INT32:
      return MaxUnpool3D_COMPUTE_CASE<int32_t>(ctx, indices_type);
    case DT_INT64:
      return MaxUnpool3D_COMPUTE_CASE<int64_t>(ctx, indices_type);
    case DT_UINT8:
      return MaxUnpool3D_COMPUTE_CASE<uint8_t>(ctx, indices_type);
    case DT_UINT16:
      return MaxUnpool3D_COMPUTE_CASE<uint16_t>(ctx, indices_type);
    case DT_UINT32:
      return MaxUnpool3D_COMPUTE_CASE<uint32_t>(ctx, indices_type);
    case DT_UINT64:
      return MaxUnpool3D_COMPUTE_CASE<uint64_t>(ctx, indices_type);
    case DT_FLOAT16:
      return MaxUnpool3D_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
    case DT_FLOAT:
      return MaxUnpool3D_COMPUTE_CASE<float>(ctx, indices_type);
    case DT_DOUBLE:
      return MaxUnpool3D_COMPUTE_CASE<double>(ctx, indices_type);
    default:
      KERNEL_LOG_ERROR("MaxUnpool3D kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t MaxUnpool3DCpuKernel::MaxUnpool3DCheck(CpuKernelContext &ctx) {
  DataType input0Type = ctx.Input(0)->GetDataType();
  DataType outputType = ctx.Output(0)->GetDataType();
  KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of output [%d] must be the same as "
                     "input0 [%d].",
                     outputType, input0Type)

  KERNEL_LOG_INFO(
    "MaxUnpool3DCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Output(0)->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool3DCpuKernel::MaxUnpool3DCompute(CpuKernelContext &ctx) {
  Tensor *input = ctx.Input(0);
  Tensor *indices = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  std::string dataFormat = "NCDHW";
  if (ctx.GetAttr("data_format") != nullptr) {
    dataFormat = ctx.GetAttr("data_format")->GetString();
  }
  int32_t NIndex, CIndex, DIndex, HIndex, WIndex;
  bool error = false;
  if (dataFormat == "NDHWC") {
    NIndex = 0;
    CIndex = 4;
    DIndex = 1;
    HIndex = 2;
    WIndex = 3;
    auto input_shape = input->GetTensorShape();
    int64_t numBatch = input_shape->GetDimSize(NIndex);
    int64_t inputDepth = input_shape->GetDimSize(DIndex);
    int64_t inputHeight = input_shape->GetDimSize(HIndex);
    int64_t inputWidth = input_shape->GetDimSize(WIndex);
    int64_t numChannels = input_shape->GetDimSize(CIndex);

    auto output_shape = output->GetTensorShape();
    int64_t odepth = output_shape->GetDimSize(DIndex);
    int64_t oheight = output_shape->GetDimSize(HIndex);
    int64_t owidth = output_shape->GetDimSize(WIndex);

    auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int s = 0; s < numBatch * odepth * oheight * owidth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }

    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * odepth * owidth * oheight;
        int64_t nInputOffset = n * numChannels * inputDepth * inputWidth * inputHeight;
        DATA_T *output_p_k = rawOutput + nOutputOffset;
        DATA_T *input_p_k = rawInput + nInputOffset;
        INDICES_T *ind_p_k = rawIndices + nInputOffset;

        int64_t maxp;
        for (int64_t k = 0; k < numChannels; k++) {
          for (int64_t t = 0; t < inputDepth; t++) {
            for (int64_t i = 0; i < inputHeight; i++) {
              for (int64_t j = 0; j < inputWidth; j++) {
                maxp = ind_p_k[t * inputHeight * inputWidth * numChannels + i * inputWidth * numChannels +
                               j * numChannels + k];
                if (maxp < 0 || maxp >= odepth * owidth * oheight) {
                  error = true;
                  KERNEL_LOG_ERROR(
                    "MaxUnpool3D: output_size D_out * H_out * W_out "
                    "must be greater than the values in argmax; now D_out is [%ld], H_out "
                    "is [%ld], and W_out is [%ld], but one of the values in "
                    "argmax is [%ld].",
                    odepth, oheight, owidth, maxp);
                } else {
                  output_p_k[maxp * numChannels + k] = input_p_k[t * inputHeight * inputWidth * numChannels +
                                                                 i * inputWidth * numChannels + j * numChannels + k];
                }
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  } else {
    NIndex = 0;
    CIndex = 1;
    DIndex = 2;
    HIndex = 3;
    WIndex = 4;

    auto input_shape = input->GetTensorShape();
    int64_t numBatch = input_shape->GetDimSize(NIndex);
    int64_t inputDepth = input_shape->GetDimSize(DIndex);
    int64_t inputHeight = input_shape->GetDimSize(HIndex);
    int64_t inputWidth = input_shape->GetDimSize(WIndex);
    int64_t numChannels = input_shape->GetDimSize(CIndex);
    auto output_shape = output->GetTensorShape();
    int64_t odepth = output_shape->GetDimSize(DIndex);
    int64_t oheight = output_shape->GetDimSize(HIndex);
    int64_t owidth = output_shape->GetDimSize(WIndex);
    auto *rawInput = reinterpret_cast<DATA_T *>(input->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int s = 0; s < numBatch * odepth * oheight * owidth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }

    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * odepth * owidth * oheight;
        int64_t nInputOffset = n * numChannels * inputDepth * inputWidth * inputHeight;
        for (int64_t k = 0; k < numChannels; k++) {
          int64_t finalOutputOffset = nOutputOffset + k * odepth * owidth * oheight;
          int64_t finalInputOffset = nInputOffset + k * inputDepth * inputWidth * inputHeight;
          DATA_T *output_p_k = rawOutput + finalOutputOffset;
          DATA_T *input_p_k = rawInput + finalInputOffset;
          INDICES_T *ind_p_k = rawIndices + finalInputOffset;
          int64_t maxp;
          for (int64_t t = 0; t < inputDepth; t++) {
            for (int64_t i = 0; i < inputHeight; i++) {
              for (int64_t j = 0; j < inputWidth; j++) {
                maxp = ind_p_k[t * inputHeight * inputWidth + i * inputWidth + j];
                if (maxp < 0 || maxp >= odepth * owidth * oheight) {
                  error = true;
                  KERNEL_LOG_ERROR(
                    "MaxUnpool3D: output_size D_out * H_out * W_out "
                    "must be greater than the values in argmax; now D_out is [%ld], H_out "
                    "is [%ld], and W_out is [%ld], but one of the values in "
                    "argmax is [%ld].",
                    odepth, oheight, owidth, maxp);
                } else {
                  output_p_k[maxp] = input_p_k[t * inputHeight * inputWidth + i * inputWidth + j];
                }
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  }
  if (error) {
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMaxUnpool3D, MaxUnpool3DCpuKernel);
} // namespace aicpu
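SWITCH_PARALLEL above trades scheduling overhead against parallelism: batch counts up to kParallelDataNums are processed in a plain serial loop, anything larger is handed to CpuKernelUtils::ParallelFor one batch element at a time. A function-style sketch of the same dispatch follows; parallel_for stands in for the framework call and all names here are illustrative.

#include <cstdint>
#include <functional>

// Sketch: run `shard` serially when the workload is small, otherwise split it.
inline void RunSharded(int64_t total, const std::function<void(int64_t, int64_t)> &shard,
                       const std::function<void(int64_t, const std::function<void(int64_t, int64_t)> &)> &parallel_for) {
  constexpr int64_t kSerialCutoff = 1024;  // mirrors kParallelDataNums
  if (total <= kSerialCutoff) {
    for (int64_t i = 0; i < total; i++) {
      shard(i, i + 1);  // one batch element per call, as the macro does
    }
  } else {
    parallel_for(total, shard);  // framework decides the actual chunking
  }
}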
@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool3DCpuKernel : public CpuKernel {
 public:
  MaxUnpool3DCpuKernel() = default;
  ~MaxUnpool3DCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MaxUnpool3DCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MaxUnpool3D_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);

  template <typename T, typename S>
  static uint32_t MaxUnpool3DCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_H_
@ -0,0 +1,258 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "max_unpool_3d_grad.h"

#include <cmath>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
constexpr int64_t kParallelDataNums = 1024;
const char *kMaxUnpool3DGrad = "MaxUnpool3DGrad";

#define SWITCH_PARALLEL(SHARD, end_num, ctx)                                 \
  if (end_num <= kParallelDataNums) {                                        \
    for (size_t i = 0; i < size_t(end_num); i++) {                           \
      SHARD(i, i + 1);                                                       \
    }                                                                        \
  } else {                                                                   \
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), \
                        "MaxUnpool3DGrad #SHARD Compute failed.");           \
  }

}  // namespace

namespace aicpu {
template <typename DATA_T>
uint32_t MaxUnpool3DGradCpuKernel::MaxUnpool3DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type) {
  // Dispatch on the data type of the indices tensor.
  switch (indices_type) {
    case DT_INT32:
      return MaxUnpool3DGradCompute<DATA_T, int32_t>(ctx);
    case DT_INT64:
      return MaxUnpool3DGradCompute<DATA_T, int64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("indices_type [%s] must be in [{DT_INT32, DT_INT64}].", DTypeStr(indices_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t MaxUnpool3DGradCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MaxUnpool3DGrad check input and output number failed.");
  KERNEL_HANDLE_ERROR(MaxUnpool3DGradCheck(ctx), "MaxUnpool3DGrad check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  auto indices_type = ctx.Input(2)->GetDataType();
  switch (data_type) {
    case DT_INT8:
      return MaxUnpool3DGrad_COMPUTE_CASE<int8_t>(ctx, indices_type);
    case DT_INT16:
      return MaxUnpool3DGrad_COMPUTE_CASE<int16_t>(ctx, indices_type);
    case DT_INT32:
      return MaxUnpool3DGrad_COMPUTE_CASE<int32_t>(ctx, indices_type);
    case DT_INT64:
      return MaxUnpool3DGrad_COMPUTE_CASE<int64_t>(ctx, indices_type);
    case DT_UINT8:
      return MaxUnpool3DGrad_COMPUTE_CASE<uint8_t>(ctx, indices_type);
    case DT_UINT16:
      return MaxUnpool3DGrad_COMPUTE_CASE<uint16_t>(ctx, indices_type);
    case DT_UINT32:
      return MaxUnpool3DGrad_COMPUTE_CASE<uint32_t>(ctx, indices_type);
    case DT_UINT64:
      return MaxUnpool3DGrad_COMPUTE_CASE<uint64_t>(ctx, indices_type);
    case DT_FLOAT16:
      return MaxUnpool3DGrad_COMPUTE_CASE<Eigen::half>(ctx, indices_type);
    case DT_FLOAT:
      return MaxUnpool3DGrad_COMPUTE_CASE<float>(ctx, indices_type);
    case DT_DOUBLE:
      return MaxUnpool3DGrad_COMPUTE_CASE<double>(ctx, indices_type);
    default:
      KERNEL_LOG_ERROR("MaxUnpool3DGrad kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCheck(CpuKernelContext &ctx) {
  DataType input0Type = ctx.Input(0)->GetDataType();
  DataType input1Type = ctx.Input(1)->GetDataType();
  DataType outputType = ctx.Output(0)->GetDataType();
  KERNEL_CHECK_FALSE((input0Type == input1Type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input1 [%d] must be the same as "
                     "input0 [%d].",
                     input1Type, input0Type)

  KERNEL_CHECK_FALSE((input0Type == outputType), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of output [%d] must be the same as "
                     "input0 [%d].",
                     outputType, input0Type)
  auto Input0_size = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  auto Input2_size = ctx.Input(2)->GetTensorShape()->GetDimSizes();

  KERNEL_CHECK_FALSE((Input0_size == Input2_size), KERNEL_STATUS_PARAM_INVALID,
                     "The shape of input x must be the same as the shape of input argmax.")

  KERNEL_LOG_INFO(
    "MaxUnpool3DGradCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], input2: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize(), ctx.Input(2)->GetDataSize(),
    ctx.Output(0)->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename DATA_T, typename INDICES_T>
uint32_t MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute(CpuKernelContext &ctx) {
  Tensor *grads = ctx.Input(1);
  Tensor *indices = ctx.Input(2);
  Tensor *output = ctx.Output(0);
  std::string dataFormat = "NCDHW";
  if (ctx.GetAttr("data_format") != nullptr) {
    dataFormat = ctx.GetAttr("data_format")->GetString();
  }
  int32_t NIndex, CIndex, DIndex, HIndex, WIndex;
  bool error = false;
  if (dataFormat == "NDHWC") {
    NIndex = 0;
    CIndex = 4;
    DIndex = 1;
    HIndex = 2;
    WIndex = 3;

    auto grads_out_shape = grads->GetTensorShape();
    int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
    int64_t odepth = grads_out_shape->GetDimSize(DIndex);
    int64_t oheight = grads_out_shape->GetDimSize(HIndex);
    int64_t owidth = grads_out_shape->GetDimSize(WIndex);
    int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
    auto output_shape = output->GetTensorShape();
    int64_t idepth = output_shape->GetDimSize(DIndex);
    int64_t iheight = output_shape->GetDimSize(HIndex);
    int64_t iwidth = output_shape->GetDimSize(WIndex);
    auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());

    for (int s = 0; s < numBatch * iheight * iwidth * idepth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }
    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * iwidth * iheight * idepth;
        int64_t nGradsOffset = n * numChannels * owidth * oheight * odepth;
        DATA_T *output_p_k = rawOutput + nOutputOffset;
        DATA_T *grads_p_k = rawGrads + nGradsOffset;
        INDICES_T *ind_p_k = rawIndices + nOutputOffset;
        int64_t maxp;
        for (int64_t k = 0; k < numChannels; k++) {
          for (int64_t t = 0; t < idepth; t++) {
            for (int64_t i = 0; i < iheight; i++) {
              for (int64_t j = 0; j < iwidth; j++) {
                maxp = ind_p_k[t * iwidth * iheight * numChannels + i * iwidth * numChannels + j * numChannels + k];
                if (maxp < 0 || maxp >= owidth * oheight * odepth) {
                  error = true;
                  KERNEL_LOG_ERROR(
                    "MaxUnpool3DGrad: output_size D_out * H_out * W_out "
                    "must be greater than the values in argmax; now D_out is [%ld], H_out "
                    "is [%ld], and W_out is [%ld], but one of the values in "
                    "argmax is [%ld].",
                    odepth, oheight, owidth, maxp);
                } else {
                  output_p_k[t * iwidth * iheight * numChannels + i * iwidth * numChannels + j * numChannels + k] =
                    grads_p_k[maxp * numChannels + k];
                }
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  } else {
    NIndex = 0;
    CIndex = 1;
    DIndex = 2;
    HIndex = 3;
    WIndex = 4;

    auto grads_out_shape = grads->GetTensorShape();
    int64_t numBatch = grads_out_shape->GetDimSize(NIndex);
    int64_t odepth = grads_out_shape->GetDimSize(DIndex);
    int64_t oheight = grads_out_shape->GetDimSize(HIndex);
    int64_t owidth = grads_out_shape->GetDimSize(WIndex);
    int64_t numChannels = grads_out_shape->GetDimSize(CIndex);
    auto output_shape = output->GetTensorShape();
    int64_t idepth = output_shape->GetDimSize(DIndex);
    int64_t iheight = output_shape->GetDimSize(HIndex);
    int64_t iwidth = output_shape->GetDimSize(WIndex);
    auto *rawGrads = reinterpret_cast<DATA_T *>(grads->GetData());
    auto *rawIndices = reinterpret_cast<INDICES_T *>(indices->GetData());
    auto *rawOutput = reinterpret_cast<DATA_T *>(output->GetData());
    for (int s = 0; s < numBatch * idepth * iheight * iwidth * numChannels; s++) {
      rawOutput[s] = (DATA_T)0;
    }
    auto shard = [&](int64_t start, int64_t end) {
      for (int64_t n = start; n < end; n++) {
        int64_t nOutputOffset = n * numChannels * iwidth * iheight * idepth;
        int64_t nGradsOffset = n * numChannels * owidth * oheight * odepth;
        for (int64_t k = 0; k < numChannels; k++) {
          int64_t finalOutputOffset = nOutputOffset + k * iwidth * iheight * idepth;
          int64_t finalGradsOffset = nGradsOffset + k * owidth * oheight * odepth;
          DATA_T *output_p_k = rawOutput + finalOutputOffset;
          DATA_T *grads_p_k = rawGrads + finalGradsOffset;
          INDICES_T *ind_p_k = rawIndices + finalOutputOffset;
          int64_t maxp;
          for (int64_t t = 0; t < idepth; t++) {
            for (int64_t i = 0; i < iheight; i++) {
              for (int64_t j = 0; j < iwidth; j++) {
                maxp = ind_p_k[t * iheight * iwidth + i * iwidth + j];
                if (maxp < 0 || maxp >= owidth * oheight * odepth) {
                  error = true;
                  KERNEL_LOG_ERROR(
                    "MaxUnpool3DGrad: output_size D_out * H_out * W_out "
                    "must be greater than the values in argmax; now D_out is [%ld], H_out "
                    "is [%ld], and W_out is [%ld], but one of the values in "
                    "argmax is [%ld].",
                    odepth, oheight, owidth, maxp);
                } else {
                  output_p_k[t * iheight * iwidth + i * iwidth + j] = grads_p_k[maxp];
                }
              }
            }
          }
        }
      }
    };
    SWITCH_PARALLEL(shard, numBatch, ctx);
  }
  if (error) {
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMaxUnpool3DGrad, MaxUnpool3DGradCpuKernel);
} // namespace aicpu
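Because the forward MaxUnpool3D scatters each input value to its argmax position, the gradient kernel above is the mirror image: a gather from the pooled gradient through the same indices. A standalone sketch for one NCDHW (batch, channel) slice, assuming float data (the function and parameter names are hypothetical, not the kernel's API):

#include <cstdint>

// Sketch: gather pooled gradients for one NCDHW (n, c) slice. `grad` holds
// D_out*H_out*W_out values; `argmax` and `dx` hold depth*height*width values.
static bool UnpoolGrad3DSlice(const float *grad, const int64_t *argmax, float *dx,
                              int64_t depth, int64_t height, int64_t width, int64_t pooled_size) {
  for (int64_t t = 0; t < depth; t++) {
    for (int64_t i = 0; i < height; i++) {
      for (int64_t j = 0; j < width; j++) {
        int64_t idx = t * height * width + i * width + j;  // flattened (t, i, j)
        int64_t maxp = argmax[idx];
        if (maxp < 0 || maxp >= pooled_size) {
          return false;  // same bounds check as the kernel
        }
        dx[idx] = grad[maxp];
      }
    }
  }
  return true;
}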
@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_GRAD_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxUnpool3DGradCpuKernel : public CpuKernel {
 public:
  MaxUnpool3DGradCpuKernel() = default;
  ~MaxUnpool3DGradCpuKernel() = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MaxUnpool3DGradCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MaxUnpool3DGrad_COMPUTE_CASE(CpuKernelContext &ctx, DataType indices_type);

  template <typename T, typename S>
  static uint32_t MaxUnpool3DGradCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_MAX_UNPOOL3D_GRAD_H_
@ -0,0 +1,429 @@
/**
 * Copyright 2021 Harbin Institute of Technology
 * Copyright 2021 Huawei Technologies Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "maxpool_grad.h"

#include <Eigen/Dense>
#include <algorithm>
#include <map>
#include <string>
#include <vector>

#include "cpu_kernel_utils.h"
#include "utils/allocator_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kMaxPoolGrad = "MaxPoolGrad";
constexpr uint32_t kInvalidMaxPoolingIndex = -1;
constexpr uint32_t kMaxPoolGradInputNum = 3;
constexpr uint32_t kMaxPoolGradOutputNum = 1;
constexpr int64_t kParallelNum_7K = 7 * 1024;
constexpr int64_t kParallelNum_16K = 16 * 1024;
constexpr int64_t kParallelNum_128K = 128 * 1024;
constexpr uint32_t kThirdInputIndex = 2;
struct PoolParams {
  int depth;

  int tensor_cols;
  int tensor_rows;
  int tensor_batch;

  int ksize_rows;
  int ksize_cols;
  int ksize_depth;

  int strides_rows;
  int strides_cols;
  int strides_depth;

  int64_t out_height;
  int64_t out_width;
  int out_depth;

  int64_t pad_top;
  int64_t pad_bottom;
  int64_t pad_left;
  int64_t pad_right;
};
}  // namespace
namespace aicpu {
template <typename T, typename Targmax>
uint32_t SpatialMaxPoolWithArgMaxHelper(CpuKernelContext &ctx, const PoolParams &params) {
  bool include_batch_in_index = true;

  Tensor *tensor_in = ctx.Input(kFirstInputIndex);
  EigenTensor input_eigen_tensor(tensor_in, tensor_in->GetData());
  Tensor *tensor_out = ctx.Input(kSecondInputIndex);
  EigenTensor output_eigen_tensor(tensor_out, tensor_out->GetData());
  Tensor *tensor_out_backprop = ctx.Input(kThirdInputIndex);
  EigenTensor out_backprop(tensor_out_backprop, tensor_out_backprop->GetData());
  Tensor *tensor_output_dup = ctx.Output(kFirstOutputIndex);
  EigenTensor input_backprop(tensor_output_dup, tensor_output_dup->GetData());

  // create a new aicpu::Tensor to hold the per-position argmax of the forward pass
  auto tensor_out_arg_max_tmp = CpuKernelUtils::CreateTensor();
  Targmax *arg_max = new Targmax[tensor_output_dup->NumElements()];

  TensorShape out_dup_ts = *(tensor_output_dup->GetTensorShape());
  tensor_out_arg_max_tmp->SetDataType(DT_INT64);
  tensor_out_arg_max_tmp->SetData(static_cast<void *>(arg_max));
  tensor_out_arg_max_tmp->SetDataSize(tensor_output_dup->GetDataSize());

  auto out_arg_max_ts = tensor_out_arg_max_tmp->GetTensorShape();
  out_arg_max_ts->SetFormat(out_dup_ts.GetFormat());
  out_arg_max_ts->SetUnknownRank(out_dup_ts.GetUnknownRank());
  out_arg_max_ts->SetDimSizes(out_dup_ts.GetDimSizes());

  auto tensor_out_arg_max = tensor_out_arg_max_tmp.get();
  EigenTensor output_arg_max(tensor_out_arg_max, tensor_out_arg_max->GetData());

  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> EigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<Targmax, Eigen::Dynamic, Eigen::Dynamic>> EigenIndexMatrixMap;

  ConstEigenMatrixMap in_mat(input_eigen_tensor.flat<T>().data(), params.depth,
                             params.tensor_cols * params.tensor_rows * params.tensor_batch);
  EigenMatrixMap out_mat(output_eigen_tensor.flat<T>().data(), params.depth,
                         params.out_width * params.out_height * params.tensor_batch);
  EigenIndexMatrixMap out_arg_max_mat(output_arg_max.flat<Targmax>().data(), params.depth,
                                      params.out_width * params.out_height * params.tensor_batch);

  input_backprop.flat<T>().setZero();
  auto orig_input_ptr = static_cast<T *>(tensor_in->GetData());
  auto orig_output_ptr = static_cast<T *>(tensor_out->GetData());
  auto grad_ptr = static_cast<T *>(tensor_out_backprop->GetData());
  auto output_ptr = static_cast<T *>(tensor_output_dup->GetData());
  // shard_NCHW's limit is params.tensor_batch * params.depth
  auto shard_NCHW = [&params, &orig_input_ptr, &orig_output_ptr, &grad_ptr, &output_ptr](int64_t start, int64_t limit) {
    typedef Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenArrayMap;
    typedef Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> EigenArrayMap;
    const int64_t X_W = static_cast<int64_t>(params.tensor_cols), X_H = static_cast<int64_t>(params.tensor_rows);
    const int64_t Y_W = params.out_width, Y_H = params.out_height;
    const int64_t batch_size = limit;
    const int64_t X_HxW = X_H * X_W, Y_HxW = Y_H * Y_W;
    const int64_t X_stride = X_HxW, Y_stride = Y_HxW;
    const int64_t stride_h = static_cast<int64_t>(params.strides_rows),
                  stride_w = static_cast<int64_t>(params.strides_cols);
    const int64_t pad_t = params.pad_top, pad_l = params.pad_left;
    const int64_t kernel_h = static_cast<int64_t>(params.ksize_rows),
                  kernel_w = static_cast<int64_t>(params.ksize_cols);
    const T *dy_ptr = grad_ptr + start * Y_stride;
    const T *x_ptr = orig_input_ptr + start * X_stride;
    const T *y_ptr = orig_output_ptr + start * Y_stride;
    T *dx_ptr = output_ptr + start * X_stride;
    for (int64_t i = start; i < batch_size; i++) {
      ConstEigenArrayMap dy_arr(dy_ptr, Y_W, Y_H);
      ConstEigenArrayMap x_arr(x_ptr, X_W, X_H);
      ConstEigenArrayMap y_arr(y_ptr, Y_W, Y_H);
      EigenArrayMap dx_arr(dx_ptr, X_W, X_H);
      for (int64_t h = 0; h < Y_H; ++h) {
        const int64_t t = std::max(h * stride_h - pad_t, static_cast<int64_t>(0));
        const int64_t b = std::min(h * stride_h - pad_t + kernel_h, X_H);
        for (int64_t w = 0; w < Y_W; ++w) {
          const int64_t l = std::max(w * stride_w - pad_l, static_cast<int64_t>(0));
          const int64_t r = std::min(w * stride_w - pad_l + kernel_w, X_W);
          const int64_t y = h * Y_W + w;
          auto some_max_block = (x_arr.block(l, t, r - l, b - t) == y_arr(y)).template cast<T>();
          int64_t first_max_x_rel = 0, first_max_y_rel = 0;
          bool max_found = false;
          for (int64_t by = 0; by < b - t; ++by) {
            for (int64_t bx = 0; bx < r - l; ++bx) {
              if (some_max_block(bx, by) == static_cast<T>(1)) {
                first_max_x_rel = bx, first_max_y_rel = by, max_found = true;
                break;
              }
            }
            if (max_found) {
              break;
            }
          }
          const int64_t fact_index_h = t + first_max_y_rel, fact_index_w = l + first_max_x_rel;
          *(dx_ptr + fact_index_h * X_W + fact_index_w) += static_cast<T>(1) * dy_arr(y);
        }
      }
      dy_ptr += Y_stride;
      x_ptr += X_stride;
      y_ptr += Y_stride;
      dx_ptr += X_stride;
    }
  };
  auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop, &output_arg_max, &out_backprop,
                &tensor_out_backprop, include_batch_in_index](int64_t start, int64_t limit) {
    const int32_t depth = params.depth;
    const int32_t in_rows = params.tensor_rows;
    const int32_t in_cols = params.tensor_cols;
    const int32_t pad_top = params.pad_top;
    const int32_t pad_left = params.pad_left;
    const int32_t window_rows = params.ksize_rows;
    const int32_t window_cols = params.ksize_cols;
    const int32_t row_stride = params.strides_rows;
    const int32_t col_stride = params.strides_cols;
    const int32_t out_height = params.out_height;
    const int32_t out_width = params.out_width;
    {
      const int32_t output_image_size = out_height * out_width * depth;
      EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1, (limit - start) * output_image_size);
      out_shard.setConstant(Eigen::NumTraits<T>::lowest());
      EigenIndexMatrixMap out_arg_max_shard(out_arg_max_mat.data() + start * output_image_size, 1,
                                            (limit - start) * output_image_size);
      out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex);
    }

    for (int64_t b = start; b < limit; ++b) {
      for (int h = 0; h < in_rows; ++h) {
        for (int w = 0; w < in_cols; ++w) {
          const int hpad = h + pad_top;
          const int wpad = w + pad_left;
          const int h_start = (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1;
          const int h_end = std::min(hpad / row_stride + 1, out_height);
          const int w_start = (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1;
          const int w_end = std::min(wpad / col_stride + 1, out_width);
          const int64_t in_index = (b * in_rows + h) * in_cols + w;
          for (int ph = h_start; ph < h_end; ++ph) {
            const int64_t out_index_base = (b * out_height + ph) * out_width;
            for (int pw = w_start; pw < w_end; ++pw) {
              const int64_t out_index = out_index_base + pw;
              for (int d = 0; d < depth; ++d) {
                const T &input_ref = in_mat.coeffRef(d, in_index);
                T &output_ref = out_mat.coeffRef(d, out_index);
                Targmax &out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
                if (output_ref < input_ref || out_arg_max_ref == kInvalidMaxPoolingIndex) {
                  output_ref = input_ref;
                  if (include_batch_in_index) {
                    out_arg_max_ref = in_index * depth + d;
                  } else {
                    out_arg_max_ref = (h * in_cols + w) * depth + d;
                  }
                }
              }
            }
          }
        }
      }
    }
    if (include_batch_in_index) {
      auto input_backprop_flat = input_backprop.flat<T>();
      auto out_arg_max_flat = output_arg_max.flat<int64_t>();
      auto out_backprop_flat = out_backprop.flat<T>();
      const int64_t in_size = in_rows * in_cols * depth;
      const int64_t in_start = start * in_size;
      const int64_t in_end = limit * in_size;
      EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1, in_end - in_start);
      in_shard.setConstant(T(0));

      // Backpropagate.
      const int out_size = out_height * out_width * depth;
      const int out_start = start * out_size;
      const int out_end = limit * out_size;
      for (int index = out_start; index < out_end; ++index) {
        int input_backprop_index = out_arg_max_flat(index);
        // BoundsCheck
        if (input_backprop_index - in_start >= 0 && input_backprop_index - in_end < 0) {
          if (index < (tensor_out_backprop->NumElements())) {
            input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
          }
        } else {
          KERNEL_LOG_ERROR("[MaxPoolGrad] Backpropagate boundsCheck failed");
          return KERNEL_STATUS_PARAM_INVALID;
        }
      }
    }
    return KERNEL_STATUS_OK;
  };

  const int64_t total_elements = params.tensor_batch * params.tensor_rows * params.tensor_cols * params.depth;
  if (ctx.GetAttr("data_format") != nullptr && ctx.GetAttr("data_format")->GetString() == "NCHW") {
    const int64_t total_images = params.tensor_batch * params.depth;
    if (total_elements <= kParallelNum_16K) {
      shard_NCHW(0, total_images);
      return KERNEL_STATUS_OK;
    } else {
      return CpuKernelUtils::ParallelFor(ctx, total_images, 1, shard_NCHW);
    }
  }
  uint32_t tensor_batch = params.tensor_batch;
  if (total_elements <= kParallelNum_7K) {
    shard(0, params.tensor_batch);
    return KERNEL_STATUS_OK;
  } else {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
    if (total_elements <= kParallelNum_16K) {
      max_core_num = std::min(max_core_num, 4U);
    }
    if (total_elements >= kParallelNum_128K || max_core_num > tensor_batch) {
      max_core_num = params.tensor_batch;
    }
    return CpuKernelUtils::ParallelFor(ctx, params.tensor_batch, params.tensor_batch / max_core_num, shard);
  }
}
uint32_t CheckMaxPoolGrad(CpuKernelContext &ctx) {
  Tensor *tensor_in = ctx.Input(kFirstInputIndex);
  Tensor *tensor_out = ctx.Input(kSecondInputIndex);
  Tensor *out_backprop = ctx.Input(kThirdInputIndex);
  const std::vector<std::string> attr = {"ksize", "strides", "padding"};

  KERNEL_CHECK_FALSE(NormalCheck(ctx, kMaxPoolGradInputNum, kMaxPoolGradOutputNum, attr) == KERNEL_STATUS_OK,
                     KERNEL_STATUS_PARAM_INVALID, "[MaxPoolGrad] NormalCheck input and output failed.");
  // check tensor_in dims
  Tensor &input0 = *(tensor_in);
  auto input_shape_ptr = input0.GetTensorShape();
  KERNEL_CHECK_FALSE(input_shape_ptr->GetDims() == 4, KERNEL_STATUS_PARAM_INVALID,
                     "Non-empty [4D] tensor expected for input(0).");
  // check tensor_out dims
  Tensor &input1 = *(tensor_out);
  auto output_shape_ptr = input1.GetTensorShape();
  KERNEL_CHECK_FALSE(output_shape_ptr->GetDims() == 4, KERNEL_STATUS_PARAM_INVALID,
                     "Non-empty [4D] tensor expected for input(1).");
  // check out_backprop dims
  Tensor &input2 = *(out_backprop);
  auto grad_shape_ptr = input2.GetTensorShape();
  KERNEL_CHECK_FALSE(grad_shape_ptr->GetDims() == 4, KERNEL_STATUS_PARAM_INVALID,
                     "Non-empty [4D] tensor expected for input(2).");
  KERNEL_LOG_DEBUG("[MaxPoolGrad] Parameters check pass.");
  return KERNEL_STATUS_OK;
}
uint32_t GetOutputSizeGrad(int input_size, int kernel_size, int stride, const std::string &padding,
                           int64_t *output_size, int64_t *padding_before, int64_t *padding_after) {
  KERNEL_CHECK_FALSE(stride > 0, KERNEL_STATUS_PARAM_INVALID, "[MaxPoolGrad] Stride must be positive.");
  std::string same("SAME"), valid("VALID");
  if (valid == padding) {
    *output_size = (input_size - kernel_size + stride) / stride;
    *padding_before = 0;
    *padding_after = 0;
  } else if (same == padding) {
    *output_size = (input_size + stride - 1) / stride;
    const int64_t padding_need =
      std::max(static_cast<int64_t>(0), (*output_size - 1) * stride + kernel_size - input_size);
    *padding_before = padding_need / 2;
    *padding_after = padding_need - *padding_before;
  } else {
    KERNEL_LOG_ERROR("[MaxPoolGrad] Padding is invalid.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (*output_size < 0) {
    KERNEL_LOG_ERROR("[MaxPoolGrad] Computed output size is negative.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
uint32_t ConstructPoolParams(aicpu::CpuKernelContext &ctx, const aicpu::TensorShape &data_format, PoolParams &params) {
  Format format = data_format.GetFormat();
  KERNEL_CHECK_FALSE((format == FORMAT_NHWC || format == FORMAT_NCHW), KERNEL_STATUS_PARAM_INVALID,
                     "[MaxPoolGrad] Format is not NHWC or NCHW.");
  std::vector<int64_t> tensor_in_shapes = data_format.GetDimSizes();
  std::vector<int64_t> ksize = ctx.GetAttr("ksize")->GetListInt(), strides = ctx.GetAttr("strides")->GetListInt();
  std::string padding = ctx.GetAttr("padding")->GetString();
  std::string data_format_str = "";
  if (ctx.GetAttr("data_format") == nullptr) {
    KERNEL_LOG_INFO("[MaxPoolGrad] Attr data_format is empty, using default value NHWC.");
    format = FORMAT_NHWC;
  } else {
    std::map<std::string, aicpu::Format> format_str_to_enum_map = {{"NHWC", FORMAT_NHWC}, {"NCHW", FORMAT_NCHW}};
    data_format_str = ctx.GetAttr("data_format")->GetString();

    KERNEL_CHECK_FALSE(format_str_to_enum_map.find(data_format_str) != format_str_to_enum_map.end(),
                       KERNEL_STATUS_PARAM_INVALID, "[MaxPoolGrad] data_format string is invalid.");
    format = format_str_to_enum_map[data_format_str];
  }
  switch (format) {
    case FORMAT_NHWC:
      params.depth = tensor_in_shapes[kFormatNHWCIndexC];
      params.tensor_rows = tensor_in_shapes[kFormatNHWCIndexH];
      params.tensor_cols = tensor_in_shapes[kFormatNHWCIndexW];
      params.tensor_batch = tensor_in_shapes[kFormatNHWCIndexN];
      params.ksize_rows = ksize[kFormatNHWCIndexH];
      params.ksize_cols = ksize[kFormatNHWCIndexW];
      params.ksize_depth = ksize[kFormatNHWCIndexC];
      params.strides_rows = strides[kFormatNHWCIndexH];
      params.strides_cols = strides[kFormatNHWCIndexW];
      params.strides_depth = strides[kFormatNHWCIndexC];
      break;
    case FORMAT_NCHW:
      params.depth = tensor_in_shapes[kFormatNCHWIndexC];
      params.tensor_rows = tensor_in_shapes[kFormatNCHWIndexH];
      params.tensor_cols = tensor_in_shapes[kFormatNCHWIndexW];
      params.tensor_batch = tensor_in_shapes[kFormatNCHWIndexN];
      params.ksize_rows = ksize[kFormatNCHWIndexH];
      params.ksize_cols = ksize[kFormatNCHWIndexW];
      params.ksize_depth = ksize[kFormatNCHWIndexC];
      params.strides_rows = strides[kFormatNCHWIndexH];
      params.strides_cols = strides[kFormatNCHWIndexW];
      params.strides_depth = strides[kFormatNCHWIndexC];
      break;
    default:
      KERNEL_LOG_ERROR("[MaxPoolGrad] Format is not NHWC or NCHW, current is [%d].", format);
      return KERNEL_STATUS_PARAM_INVALID;
  }
  // Only one type of pooling is supported: 2D pooling on height/width;
  // depth pooling on channel is not supported.
  KERNEL_CHECK_FALSE(params.ksize_depth == 1, KERNEL_STATUS_PARAM_INVALID,
                     "[MaxPoolGrad] Only pooling on width/height is supported.");
  // Padding calc
  if (params.ksize_depth == 1) {
    uint32_t ret1 = GetOutputSizeGrad(params.tensor_rows, params.ksize_rows, params.strides_rows, padding,
                                      &params.out_height, &params.pad_top, &params.pad_bottom);
    uint32_t ret2 = GetOutputSizeGrad(params.tensor_cols, params.ksize_cols, params.strides_cols, padding,
                                      &params.out_width, &params.pad_left, &params.pad_right);
    KERNEL_CHECK_FALSE(ret1 == KERNEL_STATUS_OK && ret2 == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
                       "[MaxPoolGrad] An error occurred while calculating output size.");
    params.out_depth = params.depth;
  }
  return KERNEL_STATUS_OK;
}
template <class T>
uint32_t ComputeMaxPoolGradImpl(CpuKernelContext &ctx) {
  TensorShape ts = *(ctx.Input(kFirstInputIndex)->GetTensorShape());
  PoolParams params;
  KERNEL_CHECK_FALSE(ConstructPoolParams(ctx, ts, params) == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
                     "[MaxPoolGrad] Parameters construct failed.")
  return SpatialMaxPoolWithArgMaxHelper<T, int64_t>(ctx, params);
}
uint32_t MaxPoolGradCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_CHECK_FALSE(CheckMaxPoolGrad(ctx) == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
                     "[MaxPoolGrad] Parameters check failed.");
  DataType input_type = ctx.Input(kFirstInputIndex)->GetDataType();
  switch (input_type) {
    case DT_FLOAT16:
      return ComputeMaxPoolGradImpl<Eigen::half>(ctx);
    case DT_FLOAT:
      return ComputeMaxPoolGradImpl<float>(ctx);
    case DT_DOUBLE:
      return ComputeMaxPoolGradImpl<double>(ctx);
    case DT_INT8:
      return ComputeMaxPoolGradImpl<int8_t>(ctx);
    case DT_INT16:
      return ComputeMaxPoolGradImpl<int16_t>(ctx);
    case DT_INT32:
      return ComputeMaxPoolGradImpl<int32_t>(ctx);
    case DT_INT64:
      return ComputeMaxPoolGradImpl<int64_t>(ctx);
    case DT_UINT8:
      return ComputeMaxPoolGradImpl<uint8_t>(ctx);
    case DT_UINT16:
      return ComputeMaxPoolGradImpl<uint16_t>(ctx);
    case DT_UINT32:
      return ComputeMaxPoolGradImpl<uint32_t>(ctx);
    case DT_UINT64:
      return ComputeMaxPoolGradImpl<uint64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("[MaxPoolGrad] Input Data type [%s] is not supported.", DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaxPoolGrad, MaxPoolGradCpuKernel);
} // namespace aicpu
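GetOutputSizeGrad above is the standard SAME/VALID pooling arithmetic: VALID keeps only fully covered windows, while SAME rounds the output size up and splits the required padding as evenly as possible, giving the extra pixel to the trailing side. A standalone sketch with a worked example in the comments (PooledOutputSize and its parameters are illustrative names):

#include <algorithm>
#include <cstdint>
#include <string>

// Sketch: SAME/VALID pooled-output size and padding split.
static bool PooledOutputSize(int64_t in, int64_t k, int64_t s, const std::string &padding,
                             int64_t *out, int64_t *pad_before, int64_t *pad_after) {
  if (s <= 0) return false;
  if (padding == "VALID") {
    *out = (in - k + s) / s;  // ceil((in - k + 1) / s) for in >= k
    *pad_before = 0;
    *pad_after = 0;
  } else if (padding == "SAME") {
    *out = (in + s - 1) / s;  // ceil(in / s)
    int64_t need = std::max<int64_t>(0, (*out - 1) * s + k - in);
    *pad_before = need / 2;  // e.g. in=5, k=3, s=2 -> out=3, need=2, pads 1/1
    *pad_after = need - *pad_before;
  } else {
    return false;
  }
  return *out >= 0;
}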
@ -0,0 +1,30 @@
/**
 * Copyright 2021 Harbin Institute of Technology
 * Copyright 2021 Huawei Technologies Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MAX_POOL_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAX_POOL_GRAD_H_

#include "cpu_ops_kernel.h"
#include "cpu_types.h"

namespace aicpu {
class MaxPoolGradCpuKernel : public CpuKernel {
 public:
  ~MaxPoolGradCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;
};
}  // namespace aicpu
#endif
@ -0,0 +1,253 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "mirror_pad.h"

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

#include "Eigen/Core"
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/equal_util.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kMirrorPadInputNum = 2;
constexpr uint32_t kMirrorPadOutputNum = 1;
const char *kMirrorPad = "MirrorPad";
constexpr int kMinDims = 0;
constexpr int kMaxDims = 5;
constexpr int kTwo = 2;
std::vector<std::string> attr_names;
std::vector<int64_t> input_dim_shape;
std::vector<int64_t> output_dim_shape;
std::vector<std::pair<int64_t, int64_t>> padding_;
std::vector<uint64_t> input_strides_;
std::vector<uint64_t> output_strides_;
int64_t input_num_elements;
int64_t output_num_elements;
int32_t dims_;
int64_t offset_;
}  // namespace

namespace aicpu {
template <typename T>
uint32_t MirrorPadCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
  // check params
  attr_names.emplace_back("mode");
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMirrorPadInputNum, kMirrorPadOutputNum, attr_names),
                      "[%s] check params failed.", kMirrorPad);
  // get Attr mode
  AttrValue *mode_ptr = ctx.GetAttr("mode");
  auto mode = mode_ptr->GetString();
  KERNEL_CHECK_FALSE((mode == "SYMMETRIC" || mode == "REFLECT"), KERNEL_STATUS_PARAM_INVALID,
                     "Attr mode must be either REFLECT or SYMMETRIC, but got attr mode[%s]", mode.c_str());
  if (mode == "SYMMETRIC") {
    offset_ = 0;
  } else if (mode == "REFLECT") {
    offset_ = 1;
  }
  // get input x
  Tensor *x_ptr = ctx.Input(0);
  data_type_ = x_ptr->GetDataType();
  auto x_shape_ptr = x_ptr->GetTensorShape();
  auto dims = x_shape_ptr->GetDims();
  dims_ = dims;
  KERNEL_CHECK_FALSE((kMinDims <= dims && dims <= kMaxDims), KERNEL_STATUS_PARAM_INVALID,
                     "inputs rank not in [%d, %d]: %d", kMinDims, kMaxDims, dims);
  // get input paddings
  Tensor *paddings_ptr = ctx.Input(1);
  auto paddings_shape_ptr = paddings_ptr->GetTensorShape();
  KERNEL_CHECK_FALSE((paddings_ptr->GetDataType() == DT_INT32 || paddings_ptr->GetDataType() == DT_INT64),
                     KERNEL_STATUS_PARAM_INVALID,
                     "Input paddings data type must be DT_INT32 or DT_INT64, "
                     "but got data type[%s]",
                     DTypeStr(paddings_ptr->GetDataType()).c_str());
  KERNEL_CHECK_FALSE(IsMatrix(paddings_shape_ptr->GetDimSizes()) && paddings_shape_ptr->GetDimSize(1) == kTwo,
                     KERNEL_STATUS_PARAM_INVALID, "paddings must be a matrix with 2 columns, but got [%lld] columns.",
                     paddings_shape_ptr->GetDimSize(1));
  KERNEL_CHECK_FALSE(dims == paddings_shape_ptr->GetDimSize(0), KERNEL_STATUS_PARAM_INVALID,
                     "The first dimension of paddings [%lld] must be the rank of inputs [%d].",
                     paddings_shape_ptr->GetDimSize(0), dims);
  // Compute the shape of the output tensor, and allocate it.
  auto size_pads_data = reinterpret_cast<T *>(paddings_ptr->GetData());
  input_num_elements = 1;
  output_num_elements = 1;
  for (int d = 0; d < dims_; ++d) {
    int64_t before = *(size_pads_data + d * 2);
    int64_t after = *(size_pads_data + d * 2 + 1);
    padding_.push_back(std::make_pair(before, after));
    KERNEL_CHECK_FALSE(before >= 0 && after >= 0, KERNEL_STATUS_PARAM_INVALID,
                       "paddings must be non-negative: [%lld] [%lld]", before, after);
    if (offset_ == 0) {
      KERNEL_CHECK_FALSE(before <= x_shape_ptr->GetDimSize(d) && after <= x_shape_ptr->GetDimSize(d),
                         KERNEL_STATUS_PARAM_INVALID,
                         "paddings must be no greater "
                         "than the dimension size: [%lld] , [%lld] greater than [%lld] ",
                         before, after, x_shape_ptr->GetDimSize(d));
    } else if (offset_ == 1) {
      KERNEL_CHECK_FALSE(before < x_shape_ptr->GetDimSize(d) && after < x_shape_ptr->GetDimSize(d),
                         KERNEL_STATUS_PARAM_INVALID,
                         "paddings must be less "
                         "than the dimension size: [%lld] , [%lld] not less than [%lld] ",
                         before, after, x_shape_ptr->GetDimSize(d));
    }
    input_dim_shape.push_back(x_shape_ptr->GetDimSize(d));
    int64_t dimi = after + x_shape_ptr->GetDimSize(d) + before;
    input_num_elements *= x_shape_ptr->GetDimSize(d);
    output_num_elements *= dimi;
    output_dim_shape.push_back(dimi);
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MirrorPadCpuKernel::DoCompute(CpuKernelContext &ctx) {
  auto input_data_ptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  if (output_num_elements == ctx.Input(0)->NumElements() || dims_ == 0) {
    uint64_t copy_size = ctx.Input(0)->GetDataSize();
    auto mem_ret = memcpy_s(output_data, copy_size, input_data_ptr, copy_size);
    KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
                       "Memcpy size[%zu] from input value to output failed.", copy_size);
  } else {
    KERNEL_CHECK_FALSE((MirrorPadCompute<T>(input_data_ptr, output_data) == KERNEL_STATUS_OK),
                       KERNEL_STATUS_PARAM_INVALID, "MirrorPadCompute failed.");
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MirrorPadCpuKernel::MirrorPadCompute(T *input_data_ptr, T *output_data_ptr) {
  input_strides_.resize(dims_);
  output_strides_.resize(dims_);
  input_strides_[dims_ - 1] = 1;
  output_strides_[dims_ - 1] = 1;
  for (int i = dims_ - 1; i > 0; --i) {
    input_strides_[i - 1] = input_strides_[i] * input_dim_shape[i];
    output_strides_[i - 1] = output_strides_[i] * output_dim_shape[i];
  }
  std::vector<std::pair<int64_t, int64_t>> index;
  index.resize(dims_);
  index[dims_ - 1] = std::make_pair(output_strides_[dims_ - 1] * padding_[dims_ - 1].first,
                                    output_strides_[dims_ - 1] * padding_[dims_ - 1].second);
  for (int i = dims_ - 1; i > 0; --i) {
    index[i - 1].first = index[i].first + output_strides_[i - 1] * padding_[i - 1].first;
    index[i - 1].second = index[i].second + output_strides_[i - 1] * padding_[i - 1].second;
  }
  if (dims_ == 1) {
    memcpy_s(output_data_ptr, padding_[0].first * sizeof(T), input_data_ptr + offset_, padding_[0].first * sizeof(T));
    memcpy_s(output_data_ptr + padding_[0].first + input_num_elements, padding_[0].second * sizeof(T),
             input_data_ptr + input_num_elements - padding_[0].second - offset_, padding_[0].second * sizeof(T));
    memcpy_s(output_data_ptr + padding_[0].first, input_num_elements * sizeof(T), input_data_ptr,
             input_num_elements * sizeof(T));
    std::reverse(output_data_ptr, output_data_ptr + padding_[0].first);
    std::reverse(output_data_ptr + padding_[0].first + input_num_elements,
                 output_data_ptr + padding_[0].first + input_num_elements + padding_[0].second);
    return KERNEL_STATUS_OK;
  }

  std::vector<int64_t> pos;
  std::vector<int64_t> output_pos, tmp_pos;
  pos.resize(dims_ - 1, 0);
  int64_t output_index = index[0].first;
  int64_t inx = 0, copy_size = sizeof(T) * input_dim_shape[dims_ - 1];
  while (inx < input_num_elements) {
    memcpy_s(output_data_ptr + output_index, copy_size, input_data_ptr + inx, copy_size);
    output_pos.push_back(output_index);
    pos[dims_ - kTwo] += 1;
    int64_t dep = dims_ - 1;
    for (int64_t i = dims_ - 2; i >= 0; --i) {
      if (i > 0 && pos[i] >= input_dim_shape[i]) {
        pos[i] -= input_dim_shape[i];
        pos[i - 1] += 1;
        dep = i;
      } else {
        break;
      }
    }
    output_index += index[dep].first + index[dep].second + input_dim_shape[dims_ - 1];
    inx += input_dim_shape[dims_ - 1];
  }
  for (int64_t i = dims_ - 1; i >= 0; --i) {
    int64_t block_size = output_strides_[i], count = 0;
    copy_size = block_size * sizeof(T);
    for (auto item : output_pos) {
      T *base_output_ptr1 = output_data_ptr + item;
      for (int64_t cnt = 1; cnt <= padding_[i].first; ++cnt) {
        memcpy_s(base_output_ptr1 - cnt * block_size, copy_size, base_output_ptr1 + (cnt - 1 + offset_) * block_size,
                 copy_size);
      }
      T *base_output_ptr2 = output_data_ptr + item + input_dim_shape[i] * block_size;
      for (int64_t cnt = 1; cnt <= padding_[i].second; ++cnt) {
        memcpy_s(base_output_ptr2 + (cnt - 1) * block_size, copy_size, base_output_ptr2 - (cnt + offset_) * block_size,
                 copy_size);
      }
      if (i > 0 && count % input_dim_shape[i - 1] == 0) {
        tmp_pos.push_back(item - padding_[i].first * block_size);
      }
      ++count;
    }
    output_pos.clear();
    for (auto item : tmp_pos) {
      output_pos.push_back(item);
    }
    tmp_pos.clear();
  }
  return KERNEL_STATUS_OK;
}

uint32_t MirrorPadCpuKernel::Compute(CpuKernelContext &ctx) {
  auto padding_type = ctx.Input(1)->GetDataType();
  if (padding_type == DT_INT32) {
    KERNEL_CHECK_FALSE((CheckAndInitParams<int32_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
                       "CheckAndInitParams failed.");
  } else {
    KERNEL_CHECK_FALSE((CheckAndInitParams<int64_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
                       "CheckAndInitParams failed.");
  }
  switch (data_type_) {
    case DT_FLOAT16:
      return DoCompute<Eigen::half>(ctx);
    case DT_FLOAT:
      return DoCompute<float>(ctx);
    case DT_DOUBLE:
      return DoCompute<double>(ctx);
    case DT_BOOL:
      return DoCompute<bool>(ctx);
    case DT_INT8:
      return DoCompute<int8_t>(ctx);
    case DT_INT16:
      return DoCompute<int16_t>(ctx);
    case DT_INT32:
      return DoCompute<int32_t>(ctx);
    case DT_INT64:
      return DoCompute<int64_t>(ctx);
    case DT_UINT8:
      return DoCompute<uint8_t>(ctx);
    case DT_UINT16:
      return DoCompute<uint16_t>(ctx);
    case DT_COMPLEX64:
      return DoCompute<std::complex<float>>(ctx);
    case DT_COMPLEX128:
      return DoCompute<std::complex<double>>(ctx);
    default:
      KERNEL_LOG_ERROR("Unsupported datatype[%s]", DTypeStr(data_type_).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

REGISTER_CPU_KERNEL(kMirrorPad, MirrorPadCpuKernel);
} // namespace aicpu
|
|
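For intuition, here is a minimal standalone sketch (not part of the kernel; standard library only) of the 1-D mirror padding that the fast path above implements. The `offset` parameter plays the role of the kernel's offset_ member: 1 excludes the edge element (REFLECT), 0 repeats it (SYMMETRIC). `MirrorPad1D` is a hypothetical name used only for this illustration.

#include <algorithm>
#include <cstdio>
#include <vector>

// Pad `in` with `left`/`right` mirrored elements. offset = 1 reflects without
// repeating the edge (REFLECT); offset = 0 repeats the edge (SYMMETRIC).
std::vector<int> MirrorPad1D(const std::vector<int> &in, int left, int right, int offset) {
  std::vector<int> out(left + in.size() + right);
  std::copy(in.begin(), in.end(), out.begin() + left);
  for (int i = 0; i < left; ++i) out[left - 1 - i] = in[i + offset];
  for (int i = 0; i < right; ++i) out[left + in.size() + i] = in[in.size() - 1 - i - offset];
  return out;
}

int main() {
  for (int v : MirrorPad1D({1, 2, 3, 4}, 2, 2, /*offset=*/1)) printf("%d ", v);  // 3 2 1 2 3 4 3 2
  printf("\n");
  for (int v : MirrorPad1D({1, 2, 3, 4}, 2, 2, /*offset=*/0)) printf("%d ", v);  // 2 1 1 2 3 4 4 3
  printf("\n");
}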
@ -0,0 +1,64 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_MIRROR_PAD_H_
#define AICPU_KERNELS_NORMALIZED_MIRROR_PAD_H_

#include <algorithm>
#include <memory>
#include <utility>
#include <vector>

#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "unsupported/Eigen/CXX11/Tensor"

namespace aicpu {
class MirrorPadCpuKernel : public CpuKernel {
 public:
  MirrorPadCpuKernel() = default;
  ~MirrorPadCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  /**
   * @brief Init params
   * @param ctx cpu kernel context
   * @return status if success
   */
  template <typename T>
  uint32_t CheckAndInitParams(CpuKernelContext &ctx);

  /**
   * @brief padding
   * @param input_data_ptr ptr which store input data
   * @param output_data_ptr ptr which store output data
   * @return status if success
   */
  template <typename T>
  uint32_t MirrorPadCompute(T *input_data_ptr, T *output_data_ptr);

  template <typename T>
  uint32_t DoCompute(CpuKernelContext &ctx);

 private:
  DataType data_type_;
  // Shape/stride bookkeeping filled in by CheckAndInitParams and consumed by
  // DoCompute/MirrorPadCompute. NOTE: these declarations were missing from the
  // hunk above even though mirror_pad.cc uses them; the types below are
  // inferred from that usage.
  int32_t dims_ = 0;
  int64_t offset_ = 0;  // 1 for REFLECT mode, 0 for SYMMETRIC mode
  int64_t input_num_elements = 0;
  int64_t output_num_elements = 0;
  std::vector<int64_t> input_dim_shape;
  std::vector<int64_t> output_dim_shape;
  std::vector<int64_t> input_strides_;
  std::vector<int64_t> output_strides_;
  std::vector<std::pair<int64_t, int64_t>> padding_;
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MIRROR_PAD_H_
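As a quick check on the shape bookkeeping at the top of this section (each padded dimension is before + dim + after, and the element counts are the running products), a small self-contained sketch; the shape and paddings are made-up example values:

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<int64_t> in_shape = {2, 3};
  std::vector<std::pair<int64_t, int64_t>> paddings = {{1, 1}, {2, 2}};  // (before, after) per dim
  int64_t in_elems = 1, out_elems = 1;
  std::vector<int64_t> out_shape;
  for (size_t d = 0; d < in_shape.size(); ++d) {
    int64_t dim = paddings[d].first + in_shape[d] + paddings[d].second;
    in_elems *= in_shape[d];
    out_elems *= dim;
    out_shape.push_back(dim);
  }
  printf("out shape: %lld x %lld, %lld -> %lld elements\n", static_cast<long long>(out_shape[0]),
         static_cast<long long>(out_shape[1]), static_cast<long long>(in_elems),
         static_cast<long long>(out_elems));  // out shape: 4 x 7, 6 -> 28 elements
}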
@ -0,0 +1,320 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "multi_margin_loss.h"

#include <Eigen/Dense>
#include <algorithm>
#include <string>
#include <vector>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const char *kMultiMarginLoss = "MultiMarginLoss";
// When the input data size exceeds kParallelDataNum bytes, use the parallel path.
const int64_t kParallelDataNum = 28 * 1024;
}  // namespace

namespace aicpu {
uint32_t MultiMarginLossCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params; the weight input is optional, so 2 or 3 inputs are acceptable
  uint32_t input_num = (ctx.GetInputsSize() == 2) ? 2 : 3;
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_num, kOutputNum),
                      "MultiMarginLoss check input and output number failed.");
  KERNEL_HANDLE_ERROR(MultiMarginLossCheck(ctx), "MultiMarginLoss check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    case DT_FLOAT16:
      return MultiMarginLossComputeFP16<Eigen::half>(ctx);
    case DT_FLOAT:
      return MultiMarginLossCompute<float>(ctx);
    case DT_DOUBLE:
      return MultiMarginLossCompute<double>(ctx);
    default:
      KERNEL_LOG_ERROR("MultiMarginLoss kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t MultiMarginLossCpuKernel::MultiMarginLossCheck(CpuKernelContext &ctx) {
  auto input_0 = ctx.Input(0);
  auto input_1 = ctx.Input(1);

  constexpr int kInputNumWithWeight = 3;
  constexpr int kRequiredXRank = 2;

  DataType input0_type = input_0->GetDataType();
  DataType input1_type = input_1->GetDataType();
  KERNEL_CHECK_FALSE((input1_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of target [%s] should be int64.", DTypeStr(input1_type).c_str())
  auto target = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
  int64_t target_num = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
  int64_t batch_size = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
  if (ctx.GetInputsSize() == kInputNumWithWeight) {
    auto input_weight = ctx.Input(2);
    DataType input2_type = input_weight->GetDataType();
    KERNEL_CHECK_FALSE((input2_type == input0_type), KERNEL_STATUS_PARAM_INVALID,
                       "weight should have the same dtype as x, but got [%s].", DTypeStr(input2_type).c_str())
  }
  KERNEL_CHECK_FALSE((ctx.Input(0)->GetTensorShape()->GetDims() == kRequiredXRank), KERNEL_STATUS_PARAM_INVALID,
                     "Rank of x should be 2.")
  KERNEL_CHECK_FALSE((ctx.Input(1)->GetTensorShape()->GetDims() == 1), KERNEL_STATUS_PARAM_INVALID,
                     "Rank of target should be 1.")
  KERNEL_CHECK_FALSE((batch_size == ctx.Input(1)->GetTensorShape()->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
                     "[%s]'s x's shape[0] should be the same as target's shape[0].", ctx.GetOpType().c_str())
  for (int64_t i = 0; i < batch_size; i++) {
    KERNEL_CHECK_FALSE(*(target + i) >= 0 && (*(target + i) < target_num), KERNEL_STATUS_PARAM_INVALID,
                       "[%s]'s target out of range", ctx.GetOpType().c_str());
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MultiMarginLossCpuKernel::MultiMarginLossCompute(CpuKernelContext &ctx) {
  // Heuristic per-unit cost factor used when slicing work for ParallelFor.
  constexpr int64_t kCostFactor = 4;
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input_target = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
  T *input_weight = nullptr;
  std::vector<T> weight_buf;  // local zero-extended copy of the weight input
  bool weight_defined = (ctx.GetInputsSize() == 3);
  if (weight_defined) {
    input_weight = reinterpret_cast<T *>(ctx.Input(2)->GetData());
    int64_t weight_length = ctx.Input(2)->NumElements();
    int64_t x_length = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
    if (weight_length < x_length) {
      // Zero-extend into a local buffer; writing past the end of the weight
      // tensor (as the original code did) would overrun the input buffer.
      weight_buf.assign(input_weight, input_weight + weight_length);
      weight_buf.resize(x_length, static_cast<T>(0));
      input_weight = weight_buf.data();
    }
  }
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  AttrValue *attr_p = ctx.GetAttr("p");
  int p = (attr_p == nullptr) ? 1 : attr_p->GetInt();
  if (p != 1 && p != 2) {
    KERNEL_LOG_ERROR("MultiMarginLoss kernel attr p should be 1 or 2.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  AttrValue *attr_margin = ctx.GetAttr("margin");
  T margin = static_cast<T>((attr_margin == nullptr) ? 1 : attr_margin->GetFloat());
  AttrValue *attr_reduction = ctx.GetAttr("reduction");
  std::string reduction = (attr_reduction == nullptr) ? "mean" : attr_reduction->GetString();
  int64_t batch_size = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
  int64_t dims = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
  Eigen::Array<T, Eigen::Dynamic, 1> output(batch_size, 1);
  output.setZero();
  auto output_data = output.data();
  int64_t min_core_num = 1;
  int64_t max_core_num = std::max(min_core_num, (int64_t)aicpu::CpuKernelUtils::GetCPUNum(ctx));
  auto shard_multi_margin_loss = [&](size_t start, size_t end) {
    int64_t once_compute_thread_size = end - start;
    Eigen::Array<T, Eigen::Dynamic, 1> calc(dims, 1);
    auto calc_data = calc.data();
    calc.setZero();
    if (dims == 0) {
      KERNEL_LOG_ERROR("dims could not be 0.");
      return;
    }
    for (int64_t m = 0; m < once_compute_thread_size / dims; m++) {
      int64_t i = start / dims;
      for (int64_t d = 0; d < dims; d++) {
        if (d == input_target[i]) {
          continue;
        }
        calc_data[d] = margin + input_x[start + d] - input_x[start + input_target[i]];
        if (calc_data[d] > T(0)) {
          calc_data[d] = (p == 1) ? calc_data[d] : calc_data[d] * calc_data[d];
          if (weight_defined) {
            calc_data[d] *= input_weight[input_target[i]];
          }
          output_data[i] += calc_data[d];
        }
      }
      output_data[i] = output_data[i] / static_cast<T>(dims);
      start += dims;
    }
  };
  if ((ctx.Input(0)->NumElements()) * sizeof(T) <= kParallelDataNum) {
    Eigen::Array<T, Eigen::Dynamic, 1> calc(dims, 1);
    auto calc_data = calc.data();
    calc.setZero();
    T sum = static_cast<T>(0);
    for (int64_t i = 0; i < batch_size; i++) {
      int64_t target_idx = input_target[i];
      sum = static_cast<T>(0);
      calc.setZero();
      for (int64_t d = 0; d < dims; d++) {
        if (d == target_idx) {
          continue;
        }
        calc_data[d] = margin + input_x[i * dims + d] - input_x[i * dims + target_idx];
        if (calc_data[d] > T(0)) {
          calc_data[d] = (p == 1) ? calc_data[d] : calc_data[d] * calc_data[d];
          if (weight_defined) {
            calc_data[d] *= static_cast<T>(input_weight[target_idx]);
          }
          sum += calc_data[d];
        }
      }
      sum = sum / static_cast<T>(dims);
      output_data[i] = sum;
    }
  } else {
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
    }
    CpuKernelUtils::ParallelFor(ctx, ctx.Input(0)->NumElements(), dims * kCostFactor * (batch_size / max_core_num + 1),
                                shard_multi_margin_loss);
  }
  if (reduction == "mean") {
    *output_y = output.mean();
  }
  if (reduction == "sum") {
    *output_y = output.sum();
  }
  if (reduction == "none") {
    for (int64_t t = 0; t < batch_size; t++) {
      *(output_y + t) = output_data[t];
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t MultiMarginLossCpuKernel::MultiMarginLossComputeFP16(CpuKernelContext &ctx) {
  // Same algorithm as above, but accumulates in float to limit the precision
  // loss of half-precision arithmetic.
  constexpr int64_t kCostFactor = 4;
  auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input_target = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
  T *input_weight = nullptr;
  std::vector<T> weight_buf;  // local zero-extended copy of the weight input
  bool weight_defined = (ctx.GetInputsSize() == 3);
  if (weight_defined) {
    input_weight = reinterpret_cast<T *>(ctx.Input(2)->GetData());
    int64_t weight_length = ctx.Input(2)->NumElements();
    int64_t x_length = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
    if (weight_length < x_length) {
      weight_buf.assign(input_weight, input_weight + weight_length);
      weight_buf.resize(x_length, static_cast<T>(0));
      input_weight = weight_buf.data();
    }
  }
  auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  AttrValue *attr_p = ctx.GetAttr("p");
  int p = (attr_p == nullptr) ? 1 : attr_p->GetInt();
  if (p != 1 && p != 2) {
    KERNEL_LOG_ERROR("MultiMarginLoss kernel attr p should be 1 or 2.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
  AttrValue *attr_margin = ctx.GetAttr("margin");
  float margin = static_cast<float>((attr_margin == nullptr) ? 1 : attr_margin->GetFloat());
  AttrValue *attr_reduction = ctx.GetAttr("reduction");
  std::string reduction = (attr_reduction == nullptr) ? "mean" : attr_reduction->GetString();
  int64_t batch_size = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
  int64_t dims = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
  Eigen::Array<float, Eigen::Dynamic, 1> output(batch_size, 1);
  output.setZero();
  auto output_data = output.data();
  int64_t min_core_num = 1;
  int64_t max_core_num = std::max(min_core_num, (int64_t)aicpu::CpuKernelUtils::GetCPUNum(ctx));
  auto shard_multi_margin_loss = [&](size_t start, size_t end) {
    int64_t once_compute_thread_size = end - start;
    Eigen::Array<float, Eigen::Dynamic, 1> calc(dims, 1);
    auto calc_data = calc.data();
    calc.setZero();
    if (dims == 0) {
      KERNEL_LOG_ERROR("dims could not be 0.");
      return;
    }
    for (int64_t m = 0; m < once_compute_thread_size / dims; m++) {
      int64_t i = start / dims;
      for (int64_t d = 0; d < dims; d++) {
        if (d == input_target[i]) {
          continue;
        }
        calc_data[d] =
          margin + static_cast<float>(input_x[start + d]) - static_cast<float>(input_x[start + input_target[i]]);
        if (calc_data[d] > 0) {
          calc_data[d] = (p == 1) ? calc_data[d] : calc_data[d] * calc_data[d];
          if (weight_defined) {
            calc_data[d] *= static_cast<float>(input_weight[input_target[i]]);
          }
          output_data[i] += calc_data[d];
        }
      }
      output_data[i] = output_data[i] / static_cast<float>(dims);
      start += dims;
    }
  };
  if ((ctx.Input(0)->NumElements()) * sizeof(T) <= kParallelDataNum) {
    Eigen::Array<float, Eigen::Dynamic, 1> calc(dims, 1);
    auto calc_data = calc.data();
    calc.setZero();
    float sum = 0;
    for (int64_t i = 0; i < batch_size; i++) {
      int64_t target_idx = input_target[i];
      sum = 0;
      calc.setZero();
      for (int64_t d = 0; d < dims; d++) {
        if (d == target_idx) {
          continue;
        }
        calc_data[d] =
          margin + static_cast<float>(input_x[i * dims + d]) - static_cast<float>(input_x[i * dims + target_idx]);
        if (calc_data[d] > 0) {
          calc_data[d] = (p == 1) ? calc_data[d] : calc_data[d] * calc_data[d];
          if (weight_defined) {
            calc_data[d] *= static_cast<float>(input_weight[target_idx]);
          }
          sum += calc_data[d];
        }
      }
      sum = sum / static_cast<float>(dims);
      output_data[i] = sum;
    }
  } else {
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
    }
    CpuKernelUtils::ParallelFor(ctx, ctx.Input(0)->NumElements(), dims * kCostFactor * (batch_size / max_core_num + 1),
                                shard_multi_margin_loss);
  }
  if (reduction == "mean") {
    *output_y = static_cast<T>(output.mean());
  }
  if (reduction == "sum") {
    *output_y = static_cast<T>(output.sum());
  }
  if (reduction == "none") {
    for (int64_t t = 0; t < batch_size; t++) {
      *(output_y + t) = static_cast<T>(output_data[t]);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kMultiMarginLoss, MultiMarginLossCpuKernel);
}  // namespace aicpu
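For reference, the per-sample quantity both compute paths above evaluate is loss_i = (1/C) * sum over d != y_i of w[y_i] * max(0, margin + x[i][d] - x[i][y_i])^p, followed by the mean/sum/none reduction. A naive scalar sketch of one row (weight omitted for brevity; `MultiMarginLossRow` is a hypothetical name for illustration):

#include <cstdio>
#include <vector>

// Naive reference for one sample's multi-margin loss (p in {1, 2}).
float MultiMarginLossRow(const std::vector<float> &x, int target, int p, float margin) {
  float sum = 0.f;
  for (size_t d = 0; d < x.size(); ++d) {
    if (static_cast<int>(d) == target) continue;
    float v = margin + x[d] - x[target];
    if (v > 0.f) sum += (p == 1) ? v : v * v;
  }
  return sum / x.size();
}

int main() {
  // One row with 3 classes, target class 1, p = 1, margin = 1:
  // (max(0, 1 + 0.1 - 0.2) + max(0, 1 + 0.3 - 0.2)) / 3 = (0.9 + 1.1) / 3
  printf("%f\n", MultiMarginLossRow({0.1f, 0.2f, 0.3f}, 1, 1, 1.0f));  // 0.666667
}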
@ -0,0 +1,41 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MULTI_MARGIN_LOSS_H_
#define AICPU_KERNELS_NORMALIZED_MULTI_MARGIN_LOSS_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class MultiMarginLossCpuKernel : public CpuKernel {
 public:
  MultiMarginLossCpuKernel() = default;
  ~MultiMarginLossCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t MultiMarginLossCheck(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MultiMarginLossCompute(CpuKernelContext &ctx);

  template <typename T>
  static uint32_t MultiMarginLossComputeFP16(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MULTI_MARGIN_LOSS_H_