!48020 migrate PadV3 and other aicpu ops

Merge pull request !48020 from 李林杰/0118_block_apicpu_ops_that_might_have_issues
This commit is contained in:
i-robot 2023-01-20 01:17:25 +00:00 committed by Gitee
commit 77442c99a2
11 changed files with 1336 additions and 6 deletions

View File

@@ -539,6 +539,8 @@ constexpr auto kPadAndShiftOpName = "PadAndShift";
constexpr auto kPaddingOpName = "Padding";
constexpr auto kPadOpName = "Pad";
constexpr auto kPadDOpName = "PadD";
constexpr auto kPadV3GradOpName = "PadV3Grad";
constexpr auto kPadV3OpName = "PadV3";
constexpr auto kParallelResizeBilinearOpName = "ParallelResizeBilinear";
constexpr auto kSyncResizeBilinearV2OpName = "SyncResizeBilinearV2";
constexpr auto kParallelResizeBilinearGradOpName = "ParallelResizeBilinearGrad";

View File

@@ -0,0 +1,198 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "logical_xor.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kLogicalXor = "LogicalXor";
// When the input data size exceeds kParallelDataNum, the parallel compute path is used
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
} // namespace
namespace aicpu {
uint32_t LogicalXorCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LogicalXor check input and output number failed.");
KERNEL_HANDLE_ERROR(LogicalXorCheck(ctx), "LogicalXor check params or bcast failed.");
uint32_t result = LogicalXorCompute<bool>(ctx);
if (result != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("LogicalXor kernel compute failed.");
return result;
}
return KERNEL_STATUS_OK;
}
uint32_t LogicalXorCpuKernel::LogicalXorCheck(CpuKernelContext &ctx) {
// non-null checks for input_0, input_1 and output were already done in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type && input0_type == DT_BOOL), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] must be the same as "
"input1 [%s], and both must be bool.",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"LogicalXorCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
/**
 * SpecialCompute handles the cases that avoid full broadcasting
 * (for bool operands, XOR is computed as inequality):
 * 1. input1 and input2 have the same shape
 * 2. input1 is a scalar or a 1D tensor with a single element
 * 3. input2 is a scalar or a 1D tensor with a single element
 * Any other shape combination is dispatched to BcastCompute instead.
 */
template <typename T>
void LogicalXorCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
const T *input2, bool *output) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input1 + i) != *(input2 + i);
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *input1 != *(input2 + i);
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input1 + i) != *input2;
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
template <typename T>
uint32_t LogicalXorCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto input_0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input_1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<bool *>(ctx.Output(0)->GetData());
int64_t input_0_elements_nums = ctx.Input(0)->NumElements();
int64_t input_1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type =
input_0_elements_nums == input_1_elements_nums
? BcastShapeType::SAME_SHAPE
: (input_0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_LogicalXor = [&](int64_t start, int64_t end) {
SpecialCompute<T>(type, start, end, input_0, input_1, out);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_LogicalXor),
"LogicalXor Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, input_0, input_1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogicalXorCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto input_0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto input_1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<bool *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_LogicalXor = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) = *(input_0 + bcast.GetBroadcastXIndex(i)) != *(input_1 + bcast.GetBroadcastYIndex(i));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_LogicalXor),
"LogicalXor Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = *(input_0 + bcast.GetBroadcastXIndex(i)) != *(input_1 + bcast.GetBroadcastYIndex(i));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t LogicalXorCpuKernel::LogicalXorCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast_needed = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast_needed) {
return NoBcastCompute<T>(ctx);
}
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
REGISTER_CPU_KERNEL(kLogicalXor, LogicalXorCpuKernel);
} // namespace aicpu
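
For reference, the three fast paths of SpecialCompute reduce to the standalone sketch below (plain C++, no aicpu runtime; the name SpecialXor and the std::vector plumbing are illustrative only). For bool operands, XOR is simply inequality:

#include <cstddef>
#include <iostream>
#include <vector>

enum class BcastShapeType { SAME_SHAPE, X_ONE_ELEMENT, Y_ONE_ELEMENT };

// Mirrors the three SpecialCompute branches for bool inputs.
void SpecialXor(BcastShapeType type, const std::vector<bool> &x, const std::vector<bool> &y, std::vector<bool> *out) {
  for (std::size_t i = 0; i < out->size(); ++i) {
    switch (type) {
      case BcastShapeType::SAME_SHAPE:    (*out)[i] = x[i] != y[i]; break;
      case BcastShapeType::X_ONE_ELEMENT: (*out)[i] = x[0] != y[i]; break;
      case BcastShapeType::Y_ONE_ELEMENT: (*out)[i] = x[i] != y[0]; break;
    }
  }
}

int main() {
  std::vector<bool> x = {true, true, false, false};
  std::vector<bool> y = {true};  // one-element input broadcasts against x
  std::vector<bool> out(x.size());
  SpecialXor(BcastShapeType::Y_ONE_ELEMENT, x, y, &out);
  for (bool v : out) std::cout << v << ' ';  // prints: 0 0 1 1
  std::cout << '\n';
  return 0;
}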

View File

@@ -0,0 +1,49 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_
#define AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class LogicalXorCpuKernel : public CpuKernel {
public:
LogicalXorCpuKernel() = default;
~LogicalXorCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t LogicalXorCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, bool *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t LogicalXorCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_

View File

@@ -0,0 +1,533 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pad_v3.h"
#include <algorithm>
#include <array>
#include <iostream>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kPadV3 = "PadV3";
constexpr int64_t kMinCoreNum = 1;
constexpr int64_t kParallelNum = 1024 * 16;
constexpr int64_t kInput3D = 3;
constexpr int64_t kInput4D = 4;
constexpr int64_t kInput5D = 5;
constexpr int64_t kPadding1D = 2;
constexpr int64_t kPadding2D = 4;
constexpr int64_t kPadding3D = 6;
constexpr int64_t kNum2 = 2;
constexpr int64_t kNum3 = 3;
constexpr int64_t kNum4 = 4;
const std::vector<std::string> mode_list = {"constant", "reflect", "edge"};
using float16 = Eigen::half;
#define PAD_V3_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("PadV3 kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t PadV3CpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input x failed")
KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input paddings failed")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output y failed")
KERNEL_HANDLE_ERROR(CheckAndInitParams(ctx), "PadV3 check and init params failed.");
auto paddings_type = ctx.Input(1)->GetDataType();
if (paddings_type == DT_INT32) {
KERNEL_CHECK_FALSE((GetPaddingsAndSetOutputShape<int32_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"Get paddings and set output shape failed.");
} else if (paddings_type == DT_INT64) {
KERNEL_CHECK_FALSE((GetPaddingsAndSetOutputShape<int64_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"Get paddings and set output shape failed.");
} else {
KERNEL_LOG_ERROR("PadV3 paddings data type [%s] not supported.", DTypeStr(paddings_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto data_type_ = ctx.Input(0)->GetDataType();
switch (data_type_) {
PAD_V3_COMPUTE_CASE(DT_INT8, int8_t, ctx)
PAD_V3_COMPUTE_CASE(DT_INT16, int16_t, ctx)
PAD_V3_COMPUTE_CASE(DT_INT32, int32_t, ctx)
PAD_V3_COMPUTE_CASE(DT_INT64, int64_t, ctx)
PAD_V3_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
PAD_V3_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
PAD_V3_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
PAD_V3_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
PAD_V3_COMPUTE_CASE(DT_FLOAT16, float16, ctx)
PAD_V3_COMPUTE_CASE(DT_FLOAT, float, ctx)
PAD_V3_COMPUTE_CASE(DT_DOUBLE, double, ctx)
PAD_V3_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
PAD_V3_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("PadV3 kernel data type [%s] not supported.", DTypeStr(data_type_).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
int64_t PadV3CpuKernel::EdgeIndexCalculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start,
int64_t i_start) {
int64_t ip;
if (now < pad_value) {
ip = pad_value;
} else if (now >= pad_value && now < input_value + pad_value) {
ip = now;
} else {
ip = input_value + pad_value - 1;
}
ip = ip - o_start + i_start;
return ip;
}
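// Worked example (illustrative): with pad_value = 2 and input_value = 4,
// o_start = 2 and i_start = 0, output positions 0..7 map to input indices
// 0 0 0 1 2 3 3 3, i.e. the border element is repeated into the padded region.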
template <typename T>
uint32_t PadV3CpuKernel::EdgeCompute1D(T *input, T *output, int64_t p) {
int64_t nplane = 0;
int64_t input_w = input_shape[kNum2];
int64_t output_w = output_shape.end()[-1];
int64_t pad_l = paddings[0];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t ip_x;
for (int64_t j = 0; j < output_w; ++j) {
ip_x = EdgeIndexCalculate(pad_l, j, input_w, o_start_x, i_start_x);
T *dest_p = output + p * output_w * (nplane + 1) + j;
T *src_p = input + p * input_w * (nplane + 1) + ip_x;
*dest_p = *src_p;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::EdgeCompute2D(T *input, T *output, int64_t p) {
int64_t pad_l = paddings[0];
int64_t pad_t = paddings[kNum2];
int64_t nplane = 0;
int64_t input_h = input_shape[kNum2];
int64_t input_w = input_shape[kNum3];
int64_t output_h = input_h + pad_t + paddings[kNum3];
int64_t output_w = input_w + pad_l + paddings[1];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t i_start_y = std::max(int64_t(0), -pad_t);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t o_start_y = std::max(int64_t(0), pad_t);
int64_t ip_x, ip_y;
for (int64_t i = 0; i < output_h; ++i) {
for (int64_t j = 0; j < output_w; ++j) {
ip_x = EdgeIndexCalculate(pad_l, j, input_w, o_start_x, i_start_x);
ip_y = EdgeIndexCalculate(pad_t, i, input_h, o_start_y, i_start_y);
T *dest_p = output + p * output_w * output_h * (nplane + 1) + i * output_w + j;
T *src_p = input + p * input_w * input_h * (nplane + 1) + ip_y * input_w + ip_x;
*dest_p = *src_p;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::EdgeCompute3D(T *input, T *output, int64_t p) {
int64_t pad_l = paddings[0];
int64_t pad_t = paddings[kNum2];
int64_t pad_f = paddings[kNum4];
int64_t nplane = 0;
int64_t input_d = input_shape[kNum2];
int64_t input_h = input_shape[kNum3];
int64_t input_w = input_shape[kNum4];
int64_t output_d = output_shape[kNum2];
int64_t output_h = output_shape[kNum3];
int64_t output_w = output_shape[kNum4];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t i_start_y = std::max(int64_t(0), -pad_t);
int64_t i_start_z = std::max(int64_t(0), -pad_f);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t o_start_y = std::max(int64_t(0), pad_t);
int64_t o_start_z = std::max(int64_t(0), pad_f);
int64_t ip_x, ip_y, ip_z;
for (int64_t k = 0; k < output_d; ++k) {
for (int64_t j = 0; j < output_h; ++j) {
for (int64_t i = 0; i < output_w; ++i) {
ip_x = EdgeIndexCalculate(pad_l, i, input_w, o_start_x, i_start_x);
ip_y = EdgeIndexCalculate(pad_t, j, input_h, o_start_y, i_start_y);
ip_z = EdgeIndexCalculate(pad_f, k, input_d, o_start_z, i_start_z);
T *dest_p =
output + p * output_w * output_h * output_d * (nplane + 1) + k * output_w * output_h + j * output_w + i;
T *src_p =
input + p * input_w * input_h * input_d * (nplane + 1) + ip_z * input_w * input_h + ip_y * input_w + ip_x;
*dest_p = *src_p;
}
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::EdgeModeCompute(CpuKernelContext &ctx, int64_t p) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
if (paddings_num == kPadding1D) {
EdgeCompute1D<T>(input, output, p);
} else if (paddings_num == kPadding2D) {
EdgeCompute2D<T>(input, output, p);
} else if (paddings_num == kPadding3D) {
EdgeCompute3D<T>(input, output, p);
}
return KERNEL_STATUS_OK;
}
int64_t PadV3CpuKernel::ReflectIndexCalculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start,
int64_t i_start) {
int64_t ip;
if (now < pad_value) {
ip = pad_value + pad_value - now;
} else if (now >= pad_value && now < input_value + pad_value) {
ip = now;
} else {
ip = (input_value + pad_value - 1) + (input_value + pad_value - 1) - now;
}
ip = ip - o_start + i_start;
return ip;
}
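// Worked example (illustrative): with pad_value = 2 and input_value = 4,
// output positions 0..7 map to input indices 2 1 0 1 2 3 2 1, i.e. indices
// fold back at the borders without repeating the border element itself.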
template <typename T>
uint32_t PadV3CpuKernel::ReflectCompute1D(T *input, T *output, int64_t p) {
int64_t nplane = 0;
int64_t input_w = input_shape[kNum2];
int64_t output_w = output_shape.end()[-1];
int64_t pad_l = paddings[0];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t ip_x;
for (int64_t j = 0; j < output_w; ++j) {
ip_x = ReflectIndexCalculate(pad_l, j, input_w, o_start_x, i_start_x);
T *dest_p = output + p * output_w * (nplane + 1) + j;
T *src_p = input + p * input_w * (nplane + 1) + ip_x;
*dest_p = *src_p;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::ReflectCompute2D(T *input, T *output, int64_t p) {
int64_t pad_l = paddings[0];
int64_t pad_t = paddings[kNum2];
int64_t nplane = 0;
int64_t input_h = input_shape[kNum2];
int64_t input_w = input_shape[kNum3];
int64_t output_h = input_h + pad_t + paddings[kNum3];
int64_t output_w = input_w + pad_l + paddings[1];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t i_start_y = std::max(int64_t(0), -pad_t);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t o_start_y = std::max(int64_t(0), pad_t);
int64_t ip_x, ip_y;
for (int64_t i = 0; i < output_h; ++i) {
for (int64_t j = 0; j < output_w; ++j) {
ip_x = ReflectIndexCalculate(pad_l, j, input_w, o_start_x, i_start_x);
ip_y = ReflectIndexCalculate(pad_t, i, input_h, o_start_y, i_start_y);
T *dest_p = output + p * output_w * output_h * (nplane + 1) + i * output_w + j;
T *src_p = input + p * input_w * input_h * (nplane + 1) + ip_y * input_w + ip_x;
*dest_p = *src_p;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::ReflectCompute3D(T *input, T *output, int64_t p) {
int64_t pad_l = paddings[0];
int64_t pad_t = paddings[kNum2];
int64_t pad_f = paddings[kNum4];
int64_t nplane = 0;
int64_t input_d = input_shape[kNum2];
int64_t input_h = input_shape[kNum3];
int64_t input_w = input_shape[kNum4];
int64_t output_d = output_shape[kNum2];
int64_t output_h = output_shape[kNum3];
int64_t output_w = output_shape[kNum4];
int64_t i_start_x = std::max(int64_t(0), -pad_l);
int64_t i_start_y = std::max(int64_t(0), -pad_t);
int64_t i_start_z = std::max(int64_t(0), -pad_f);
int64_t o_start_x = std::max(int64_t(0), pad_l);
int64_t o_start_y = std::max(int64_t(0), pad_t);
int64_t o_start_z = std::max(int64_t(0), pad_f);
int64_t ip_x, ip_y, ip_z;
for (int64_t k = 0; k < output_d; ++k) {
for (int64_t j = 0; j < output_h; ++j) {
for (int64_t i = 0; i < output_w; ++i) {
ip_x = ReflectIndexCalculate(pad_l, i, input_w, o_start_x, i_start_x);
ip_y = ReflectIndexCalculate(pad_t, j, input_h, o_start_y, i_start_y);
ip_z = ReflectIndexCalculate(pad_f, k, input_d, o_start_z, i_start_z);
T *dest_p =
output + p * output_w * output_h * output_d * (nplane + 1) + k * output_w * output_h + j * output_w + i;
T *src_p =
input + p * input_w * input_h * input_d * (nplane + 1) + ip_z * input_w * input_h + ip_y * input_w + ip_x;
*dest_p = *src_p;
}
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::ReflectModeCompute(CpuKernelContext &ctx, int64_t p) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
if (paddings_num == kPadding1D) {
ReflectCompute1D<T>(input, output, p);
} else if (paddings_num == kPadding2D) {
ReflectCompute2D<T>(input, output, p);
} else if (paddings_num == kPadding3D) {
ReflectCompute3D<T>(input, output, p);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::ConstantModeCompute(CpuKernelContext &ctx, T constant_values) {
auto input_ptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_ptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t output_num = ctx.Output(0)->NumElements();
int64_t input_num = 1;
std::vector<int64_t> input_strides(input_dims, 0);
std::vector<int64_t> output_strides(input_dims, 0);
input_strides[input_dims - 1] = 1;
output_strides[input_dims - 1] = 1;
for (int64_t i = input_dims - 1; i >= 1; --i) {
input_strides[i - 1] = input_strides[i] * input_shape[i];
output_strides[i - 1] = output_strides[i] * output_shape[i];
}
std::vector<int64_t> offsets(input_dims, 0);
std::vector<int64_t> extents(input_dims, 0);
for (int64_t i = input_dims - 1; i >= 0; --i) {
extents[i] = input_shape[i];
if (paddings[i * kNum2] < 0) {
extents[i] += paddings[i * kNum2];
offsets[i] = -paddings[i * kNum2];
paddings[i * kNum2] = 0;
}
if (paddings[i * kNum2 + 1] < 0) {
extents[i] += paddings[i * kNum2 + 1];
paddings[i * kNum2 + 1] = 0;
}
input_shape[i] = extents[i];
input_num *= input_shape[i];
}
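// Note: a negative padding crops the input. extents[i] shrinks by the negative
// amount, offsets[i] records where the crop starts, and the padding entry is
// clamped to 0 so the copy logic below only ever widens the tensor.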
std::vector<T> input_values;
for (int64_t i = 0; i < input_num; ++i) {
int64_t k = i;
int64_t p = 0;
for (int64_t j = input_dims - 1; j >= 0; --j) {
p += (offsets[j] + (k % extents[j])) * input_strides[j];
k /= extents[j];
}
input_values.push_back(*(input_ptr + p));
}
for (int64_t i = 0; i < output_num; ++i) {
*(output_ptr + i) = constant_values;
}
if (input_dims == 1) {
for (int64_t i = 0; i < input_num; ++i) {
*(output_ptr + paddings[0] + i) = input_values[i];
}
return KERNEL_STATUS_OK;
}
std::vector<int64_t> i_inx_add(input_dims, 0);
std::vector<int64_t> o_inx_add(input_dims, 0);
i_inx_add[input_dims - 1] = output_strides[input_dims - 1] * paddings[kNum2 * (input_dims - 1)];
o_inx_add[input_dims - 1] = output_strides[input_dims - 1] * paddings[kNum2 * (input_dims - 1) + 1];
for (int64_t i = input_dims - 1; i >= 1; --i) {
i_inx_add[i - 1] = i_inx_add[i] + output_strides[i - 1] * paddings[kNum2 * (i - 1)];
o_inx_add[i - 1] = o_inx_add[i] + output_strides[i - 1] * paddings[kNum2 * (i - 1) + 1];
}
int64_t i_inx = 0;
int64_t o_inx = i_inx_add[0];
std::vector<int64_t> pos(input_dims - 1, 0);
while (i_inx < input_num) {
for (int64_t i = 0; i < input_shape[input_dims - 1]; ++i) {
*(output_ptr + o_inx + i) = input_values[i_inx + i];
}
pos[input_dims - kNum2] += 1;
int64_t dep = input_dims - 1;
for (int64_t i = input_dims - kNum2; i >= 0; --i) {
if (i > 0 && pos[i] >= input_shape[i]) {
pos[i] -= input_shape[i];
pos[i - 1] += 1;
dep = i;
} else {
break;
}
}
o_inx += i_inx_add[dep] + o_inx_add[dep] + input_shape[input_dims - 1];
i_inx += input_shape[input_dims - 1];
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::DoCompute(CpuKernelContext &ctx) {
if (mode == "constant") {
T constant_values = static_cast<T>(0);
if (ctx.Input(kNum2) != nullptr) {
constant_values = *(reinterpret_cast<T *>(ctx.Input(kNum2)->GetData()));
} else {
KERNEL_LOG_DEBUG("Input [constant_values] is absent, use default value [0]");
}
for (int64_t i = 0; i < input_dims / kNum2; ++i) {
int64_t u = paddings[i * kNum2];
int64_t v = paddings[i * kNum2 + 1];
paddings[i * kNum2] = paddings[kNum2 * (input_dims - i - 1)];
paddings[i * kNum2 + 1] = paddings[kNum2 * (input_dims - i - 1) + 1];
paddings[kNum2 * (input_dims - i - 1)] = u;
paddings[kNum2 * (input_dims - i - 1) + 1] = v;
}
ConstantModeCompute<T>(ctx, constant_values);
} else if (mode == "reflect") {
auto shard_padv3_reflect = [&](int64_t start, int64_t end) {
for (int p = start; p < end; p++) {
ReflectModeCompute<T>(ctx, p);
}
};
const int64_t data_num = parallelSliceNum;
const bool enable_parallel = data_num > kParallelNum;
if (enable_parallel) {
const int64_t max_core_num =
std::max(static_cast<int64_t>(kMinCoreNum), static_cast<int64_t>(aicpu::CpuKernelUtils::GetCPUNum(ctx)));
const int64_t per_unit_size = data_num / std::min(data_num, max_core_num);
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, per_unit_size, shard_padv3_reflect),
"PadV3 Compute failed.");
} else {
shard_padv3_reflect(0, data_num);
}
} else if (mode == "edge") {
auto shard_padv3_edge = [&](int64_t start, int64_t end) {
for (int p = start; p < end; p++) {
EdgeModeCompute<T>(ctx, p);
}
};
const int64_t data_num = parallelSliceNum;
const bool enable_parallel = data_num > kParallelNum;
if (enable_parallel) {
const int64_t max_core_num =
std::max(static_cast<int64_t>(kMinCoreNum), static_cast<int64_t>(aicpu::CpuKernelUtils::GetCPUNum(ctx)));
const int64_t per_unit_size = data_num / std::min(data_num, max_core_num);
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, per_unit_size, shard_padv3_edge),
"PadV3 Compute failed.");
} else {
shard_padv3_edge(0, data_num);
}
}
return KERNEL_STATUS_OK;
}
uint32_t PadV3CpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
if (ctx.GetAttr("mode") == nullptr) {
mode = "constant";
KERNEL_LOG_DEBUG("Get attr [mode] failed, use default value [constant]");
} else {
mode = ctx.GetAttr("mode")->GetString();
const bool is_mode_available = std::find(mode_list.begin(), mode_list.end(), mode) != mode_list.end();
if (!is_mode_available) {
KERNEL_LOG_ERROR(
"Attr [mode] must be included in [constant, reflect, edge], but got "
"[%s]",
mode.c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
if (ctx.GetAttr("paddings_contiguous") != nullptr) {
paddings_contiguous = ctx.GetAttr("paddings_contiguous")->GetBool();
} else {
paddings_contiguous = true;
KERNEL_LOG_DEBUG("Get attr [paddings_contiguous] failed, use default value [true]");
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("Tensor y dtype [%s] must be the same as x dtype [%s]", DTypeStr(ctx.Output(0)->GetDataType()).c_str(),
DTypeStr(ctx.Input(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
input_dims = ctx.Input(0)->GetTensorShape()->GetDims();
const std::vector<int64_t> paddings_shape = ctx.Input(1)->GetTensorShape()->GetDimSizes();
paddings_num = ctx.Input(1)->NumElements();
KERNEL_CHECK_FALSE(paddings_shape.size() == 1 && paddings_num == input_dims * kNum2, KERNEL_STATUS_PARAM_INVALID,
"Paddings shape is not supported");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3CpuKernel::GetPaddingsAndSetOutputShape(CpuKernelContext &ctx) {
auto paddings_ptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
paddings = std::vector<int64_t>(input_dims * kNum2, 0);
for (int64_t i = 0; i < paddings_num; i += kNum2) {
paddings[i] = static_cast<int64_t>(paddings_ptr[paddings_num - i - kNum2]);
paddings[i + 1] = static_cast<int64_t>(paddings_ptr[paddings_num - i - 1]);
}
if (mode == "edge" || mode == "reflect" || (mode == "constant" && !paddings_contiguous)) {
paddings_num = paddings_num - kNum4;
}
if (!paddings_contiguous) {
std::vector<int64_t> tmp = paddings;
for (int64_t i = 0; i < paddings_num; ++i) {
if (i % kNum2 == 0) {
paddings[i] = tmp[i / kNum2];
} else {
paddings[i] = tmp[(i + paddings_num) / kNum2];
}
}
}
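// Illustrative example: with paddings_contiguous == false and paddings_num == 4,
// the stored order (begin0, begin1, end0, end1) is interleaved back into
// (begin0, end0, begin1, end1) so the loops below can read begin/end pairs.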
input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
parallelSliceNum = 1;
for (int64_t i = 0; i < input_dims - paddings_num / kNum2; ++i) {
parallelSliceNum *= input_shape[i];
}
for (int64_t i = 0; i < paddings_num / kNum2; ++i) {
output_shape.end()[-(i + 1)] += (paddings[i * kNum2] + paddings[i * kNum2 + 1]);
KERNEL_CHECK_FALSE(output_shape.end()[-(i + 1)] > 0, KERNEL_STATUS_PARAM_INVALID,
"Each dimension of output_shape must be greater than 0");
KERNEL_CHECK_FALSE(input_shape.end()[-(i + 1)] >= std::max(-paddings[i * kNum2], -paddings[i * kNum2 + 1]),
KERNEL_STATUS_PARAM_INVALID,
"The absolute value of a negative padding must not exceed the corresponding input dimension");
if (mode == "reflect") {
KERNEL_CHECK_FALSE(input_shape.end()[-(i + 1)] > std::max(paddings[i * kNum2], paddings[i * kNum2 + 1]),
KERNEL_STATUS_PARAM_INVALID,
"Padding size should be less than the corresponding input dimension");
}
}
if (output_shape != ctx.Output(0)->GetTensorShape()->GetDimSizes()) {
ctx.Output(0)->GetTensorShape()->SetDimSizes(output_shape);
KERNEL_LOG_DEBUG("Output tensor shape set successfully, num elements:[%llu]",
static_cast<uint64_t>(ctx.Output(0)->NumElements()));
} else {
KERNEL_LOG_DEBUG("Output tensor is a const tensor, num elements:[%llu]",
static_cast<uint64_t>(ctx.Output(0)->NumElements()));
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kPadV3, PadV3CpuKernel);
} // namespace aicpu
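
As a sanity check on the border index mapping above, the following standalone sketch (plain C++, independent of the aicpu runtime; PadIndex is an illustrative helper that inlines both EdgeIndexCalculate and ReflectIndexCalculate for non-negative pads) pads a four-element row by two on each side:

#include <cstdint>
#include <iostream>
#include <vector>

// Maps an output position to a source index, assuming pad >= 0
// (so i_start == 0 and o_start == pad in the kernel's terms).
int64_t PadIndex(bool reflect, int64_t pad, int64_t now, int64_t input_len) {
  int64_t ip;
  if (now < pad) {
    ip = reflect ? pad + pad - now : pad;
  } else if (now < input_len + pad) {
    ip = now;
  } else {
    ip = reflect ? 2 * (input_len + pad - 1) - now : input_len + pad - 1;
  }
  return ip - pad;
}

int main() {
  const std::vector<int> row = {1, 2, 3, 4};
  const int64_t pad = 2;
  const int64_t len = static_cast<int64_t>(row.size());
  for (bool reflect : {false, true}) {
    for (int64_t j = 0; j < len + 2 * pad; ++j) {
      std::cout << row[PadIndex(reflect, pad, j, len)] << ' ';
    }
    std::cout << '\n';  // edge:    1 1 1 2 3 4 4 4
  }                     // reflect: 3 2 1 2 3 4 3 2
  return 0;
}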

View File

@@ -0,0 +1,89 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_PAD_V3_H_
#define AICPU_KERNELS_NORMALIZED_PAD_V3_H_
#include <memory>
#include <utility>
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/bcast.h"
namespace aicpu {
class PadV3CpuKernel : public CpuKernel {
public:
PadV3CpuKernel() = default;
~PadV3CpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
std::vector<int64_t> paddings;
std::vector<int64_t> input_shape;
std::vector<int64_t> output_shape;
std::string mode;
bool paddings_contiguous{true};
int64_t input_dims{0};
int64_t paddings_num{0};
int64_t parallelSliceNum{1};
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
template <typename T>
uint32_t GetPaddingsAndSetOutputShape(CpuKernelContext &ctx);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t EdgeModeCompute(CpuKernelContext &ctx, int64_t p);
template <typename T>
uint32_t EdgeCompute3D(T *input, T *output, int64_t p);
template <typename T>
uint32_t EdgeCompute2D(T *input, T *output, int64_t p);
template <typename T>
uint32_t EdgeCompute1D(T *input, T *output, int64_t p);
int64_t EdgeIndexCalculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, int64_t i_start);
template <typename T>
uint32_t ReflectModeCompute(CpuKernelContext &ctx, int64_t p);
template <typename T>
uint32_t ReflectCompute3D(T *input, T *output, int64_t p);
template <typename T>
uint32_t ReflectCompute2D(T *input, T *output, int64_t p);
template <typename T>
uint32_t ReflectCompute1D(T *input, T *output, int64_t p);
int64_t ReflectIndexCalculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, int64_t i_start);
template <typename T>
uint32_t ConstantModeCompute(CpuKernelContext &ctx, T constant_values);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_PAD_V3_H_

View File

@@ -0,0 +1,367 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pad_v3_grad.h"
#include <algorithm>
#include <array>
#include <iostream>
#include <vector>
#include "securec.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kPadV3Grad = "PadV3Grad";
constexpr uint32_t kInputNum = 2;
constexpr uint32_t kOutputNum = 1;
constexpr int64_t kParallelNum = 1024 * 64;
const int64_t k3DNum = 6;
const int64_t k2DNum = 4;
const int64_t k1DNum = 2;
constexpr int64_t kpad_l = 0;
constexpr int64_t kpad_t = 2;
constexpr int64_t kpad_f = 4;
constexpr int64_t kwidth = 1;
constexpr int64_t kheight = 2;
constexpr int64_t kchannel = 3;
constexpr int64_t kInput1Dim = 3;
constexpr int64_t kInput2Dim = 4;
constexpr int64_t kInput3Dim = 5;
constexpr int64_t k2Num = 2;
constexpr int64_t k3Num = 3;
constexpr int64_t k4Num = 4;
const std::vector<std::string> mode_list = {"reflect", "edge"};
using float16 = Eigen::half;
#define PAD_V3_GRAD_READ_PADDINGS(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result1 = PadV3ReadPaddingsAndSetOutputShape1<TYPE>(CTX); \
uint32_t result2 = PadV3ReadPaddingsAndSetOutputShape2<TYPE>(CTX); \
if (result1 != KERNEL_STATUS_OK || result2 != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("PadV3Grad kernel read paddings failed."); \
return (result1 != KERNEL_STATUS_OK) ? result1 : result2; \
} \
break; \
}
#define PAD_V3_GRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = PadV3GradCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("PadV3Grad kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t PadV3GradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(PadV3GradCheck(ctx), "PadV3Grad check params failed.");
auto paddings_type = ctx.Input(1)->GetDataType();
switch (paddings_type) {
PAD_V3_GRAD_READ_PADDINGS(DT_INT32, int32_t, ctx)
PAD_V3_GRAD_READ_PADDINGS(DT_INT64, int64_t, ctx)
default:
KERNEL_LOG_ERROR("PadV3Grad paddings data type [%s] not supported.", DTypeStr(paddings_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto data_type = ctx.Output(0)->GetDataType();
switch (data_type) {
PAD_V3_GRAD_COMPUTE_CASE(DT_INT8, int8_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_INT16, int16_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_INT64, int64_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_FLOAT16, float16, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
PAD_V3_GRAD_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("PadV3Grad kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t PadV3GradCpuKernel::PadV3GradCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "PadV3Grad check failed.");
if (ctx.GetAttr("paddings_contiguous") == nullptr) {
padding_contiguous = true;
KERNEL_LOG_DEBUG("Get attr [paddings_contiguous] failed, use default value [true]");
} else {
padding_contiguous = ctx.GetAttr("paddings_contiguous")->GetBool();
}
if (ctx.GetAttr("mode") == nullptr) {
mode = "reflect";
KERNEL_LOG_DEBUG("Get attr [mode] failed, use default value [reflect]");
} else {
mode = ctx.GetAttr("mode")->GetString();
const bool is_mode_available = std::find(mode_list.begin(), mode_list.end(), mode) != mode_list.end();
if (!is_mode_available) {
KERNEL_LOG_ERROR("Attr [mode] must be included in [reflect, edge], but got [%s]", mode.c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("Tensor y dtype [%s] must be the same as x dtype [%s]", DTypeStr(ctx.Output(0)->GetDataType()).c_str(),
DTypeStr(ctx.Input(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
const std::vector<int64_t> paddings_shape = ctx.Input(1)->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE(
paddings_shape.size() == 1 && (paddings_shape[0] == k3DNum + k4Num || paddings_shape[0] == k2DNum + k4Num ||
paddings_shape[0] == k1DNum + k4Num || paddings_shape[0] == 1),
KERNEL_STATUS_PARAM_INVALID, "Paddings shape is not supported");
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() >= kInput1Dim, KERNEL_STATUS_PARAM_INVALID,
"Dims of tensor x should be greater than or equal to 3");
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() <= kInput3Dim, KERNEL_STATUS_PARAM_INVALID,
"Only 3D, 4D and 5D padding with non-constant mode is "
"supported for now");
const int64_t input_dim = ctx.Input(0)->GetTensorShape()->GetDims();
const int64_t num_elem = ctx.Input(1)->NumElements();
KERNEL_CHECK_FALSE(num_elem % k2Num == 0 || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
"Padding length must be divisible by 2");
if (input_dim == kInput1Dim) {
KERNEL_CHECK_FALSE(num_elem == k1DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
"3D tensors expect 6 values for padding");
} else if (input_dim == kInput2Dim) {
KERNEL_CHECK_FALSE(num_elem == k2DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
"4D tensors expect 8 values for padding");
} else if (input_dim == kInput3Dim) {
KERNEL_CHECK_FALSE(num_elem == k3DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
"5D tensors expect 10 values for padding");
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3ReadPaddingsAndSetOutputShape1(CpuKernelContext &ctx) {
num_elem = ctx.Input(1)->NumElements();
input_dim = ctx.Input(0)->GetTensorShape()->GetDims();
const std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
auto paddings_ptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
paddings = std::vector<int64_t>(input_dim * k2Num, 0);
for (int64_t i = 0; i < num_elem; i += k2Num) {
paddings[i] = static_cast<int64_t>(paddings_ptr[num_elem - i - k2Num]);
paddings[i + 1] = static_cast<int64_t>(paddings_ptr[num_elem - i - 1]);
}
num_elem = num_elem - k4Num;
if (num_elem == 1) {
num_elem = k2Num * (input_dim - k2Num);
for (int64_t i = 0; i < k2Num * (input_dim - k2Num); ++i) {
paddings[i] = static_cast<int64_t>(paddings_ptr[0]);
}
}
parallelSliceNum = 1;
for (int64_t i = 0; i < input_dim - num_elem / k2Num; i++) {
parallelSliceNum *= input_shape[i];
}
if (!padding_contiguous && num_elem == k3DNum) {
std::vector<int64_t> tmp = paddings;
paddings[1] = tmp[k3Num];
paddings[k2Num] = tmp[1];
paddings[k3Num] = tmp[k4Num];
paddings[k4Num] = tmp[k2Num];
}
if (!padding_contiguous && num_elem == k2DNum) {
std::vector<int64_t> tmp = paddings;
paddings[1] = tmp[k2Num];
paddings[k2Num] = tmp[1];
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3ReadPaddingsAndSetOutputShape2(CpuKernelContext &ctx) {
std::vector<int64_t> output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
output_shape.end()[-kwidth] -= (paddings[kpad_l] + paddings[kpad_l + 1]);
output_shape.end()[-kheight] -= (paddings[kpad_t] + paddings[kpad_t + 1]);
output_shape.end()[-kchannel] -= (paddings[kpad_f] + paddings[kpad_f + 1]);
KERNEL_CHECK_FALSE(
output_shape.end()[-kwidth] > 0 && output_shape.end()[-kheight] > 0 && output_shape.end()[-kchannel] > 0,
KERNEL_STATUS_PARAM_INVALID, "Each dimension of output_shape must be greater than 0");
if (output_shape != ctx.Output(0)->GetTensorShape()->GetDimSizes()) {
ctx.Output(0)->GetTensorShape()->SetDimSizes(output_shape);
KERNEL_LOG_DEBUG("Output tensor shape set successfully, num elements:[%llu]",
static_cast<uint64_t>(ctx.Output(0)->NumElements()));
} else {
KERNEL_LOG_DEBUG("Output tensor is a const tensor, num elements:[%llu]",
static_cast<uint64_t>(ctx.Output(0)->NumElements()));
}
const std::string padding_contiguous_str = padding_contiguous ? std::string("True") : std::string("False");
KERNEL_LOG_DEBUG(
"PadV3GradCpuKernel[%s], x: size[%llu] dtype[%s], "
"paddings: size[%llu] dtype[%s], y: size[%llu] dtype[%s], mode: [%s], "
"padding_contiguous: [%s].",
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), DTypeStr(ctx.Input(0)->GetDataType()).c_str(),
ctx.Input(1)->GetDataSize(), DTypeStr(ctx.Input(1)->GetDataType()).c_str(), ctx.Output(0)->GetDataSize(),
DTypeStr(ctx.Output(0)->GetDataType()).c_str(), mode.c_str(), padding_contiguous_str.c_str());
return KERNEL_STATUS_OK;
}
int64_t PadV3GradCpuKernel::IndexCalculate(int64_t pad_value, int64_t now, int64_t output_value, int64_t o_start,
int64_t i_start) {
int64_t ip = 0;
if (now < pad_value) {
if (mode == "reflect") {
ip = pad_value + pad_value - now;
} else if (mode == "edge") {
ip = pad_value;
}
} else if (now >= pad_value && now < output_value + pad_value) {
ip = now;
} else {
if (mode == "reflect") {
ip = (output_value + pad_value - 1) + (output_value + pad_value - 1) - now;
} else if (mode == "edge") {
ip = output_value + pad_value - 1;
}
}
ip = ip - o_start + i_start;
return ip;
}
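// This is the same border mapping as the forward pass; because several padded
// positions can resolve to one unpadded index (e.g. reflect with pad_value = 2
// and output_value = 4 maps positions 0..7 to 2 1 0 1 2 3 2 1), the callers
// below accumulate gradients with += rather than plain assignment.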
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute1(T *input, T *output, int64_t p) {
if (num_elem == k1DNum) {
PadV3GradCompute1D<T>(input, output, p);
} else if (num_elem == k2DNum) {
for (int i = 0; i < input_h; i++) {
PadV3GradCompute2D<T>(input, output, p, i);
}
} else if (num_elem == k3DNum) {
for (int z = 0; z < input_c; z++) {
PadV3GradCompute3D<T>(input, output, p, z);
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute1D(T *input, T *output, int64_t p) {
int64_t ip_x;
for (int j = 0; j < input_w; j++) {
ip_x = IndexCalculate(pad_l, j, output_w, o_start_x, i_start_x);
T *src_p = input + p * input_w + j;
T *dest_p = output + p * output_w + ip_x;
*dest_p += *src_p;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute2D(T *input, T *output, int64_t p, int64_t i) {
int64_t ip_x, ip_y;
for (int j = 0; j < input_w; j++) {
ip_x = IndexCalculate(pad_l, j, output_w, o_start_x, i_start_x);
ip_y = IndexCalculate(pad_t, i, output_h, o_start_y, i_start_y);
T *src_p = input + p * input_w * input_h + i * input_w + j;
T *dest_p = output + p * output_w * output_h + ip_y * output_w + ip_x;
*dest_p += *src_p;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute3D(T *input, T *output, int64_t p, int64_t z) {
int64_t ip_x, ip_y, ip_z;
for (int i = 0; i < input_h; i++) {
for (int j = 0; j < input_w; j++) {
ip_x = IndexCalculate(pad_l, j, output_w, o_start_x, i_start_x);
ip_y = IndexCalculate(pad_t, i, output_h, o_start_y, i_start_y);
ip_z = IndexCalculate(pad_f, z, output_c, o_start_z, i_start_z);
T *src_p = input + p * input_w * input_h * input_c + z * input_w * input_h + i * input_w + j;
T *dest_p = output + p * output_w * output_h * output_c + ip_z * output_w * output_h + ip_y * output_w + ip_x;
*dest_p += *src_p;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute(CpuKernelContext &ctx) {
const std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_shape = ctx.Output(0)->GetTensorShape()->GetDimSizes();
T *input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
T *output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
output_w = output_shape.end()[-kwidth];
output_h = output_shape.end()[-kheight];
output_c = output_shape.end()[-kchannel];
input_w = input_shape.end()[-kwidth];
input_h = input_shape.end()[-kheight];
input_c = input_shape.end()[-kchannel];
i_start_x = std::max(int64_t(0), -paddings[kpad_l]);
i_start_y = std::max(int64_t(0), -paddings[kpad_t]);
i_start_z = std::max(int64_t(0), -paddings[kpad_f]);
o_start_x = std::max(int64_t(0), paddings[kpad_l]);
o_start_y = std::max(int64_t(0), paddings[kpad_t]);
o_start_z = std::max(int64_t(0), paddings[kpad_f]);
pad_l = paddings[kpad_l];
pad_t = paddings[kpad_t];
pad_f = paddings[kpad_f];
int64_t output_num_ = 1;
for (int64_t i = 0; i < input_dim; i++) {
output_num_ *= output_shape[i];
}
auto ret = memset_s(output, sizeof(T) * output_num_, 0, sizeof(T) * output_num_);
if (ret != EOK) {
KERNEL_LOG_ERROR("memset_s error, ret=%d", ret);
return KERNEL_STATUS_INNER_ERROR;
}
auto shard_padv3_grad = [&](int64_t start, int64_t end) {
for (int p = start; p < end; p++) {
PadV3GradCompute1<T>(input, output, p);
}
};
const int64_t data_num = parallelSliceNum;
const bool enable_parallel = parallelSliceNum > kParallelNum;
if (enable_parallel) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_padv3_grad),
"PadV3Grad Compute failed.");
} else {
for (int p = 0; p < data_num; p++) {
PadV3GradCompute1<T>(input, output, p);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kPadV3Grad, PadV3GradCpuKernel);
} // namespace aicpu
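
The grad kernels scatter with += because several positions of the padded gradient can map to the same unpadded element. A minimal 1D illustration (plain C++; assumes reflect mode, a pad of two per side and an all-ones upstream gradient):

#include <iostream>
#include <vector>

int main() {
  // Forward reflect padding of {a, b, c, d} by 2 per side reads source
  // indices {2, 1, 0, 1, 2, 3, 2, 1}. With an all-ones upstream gradient,
  // the backward pass accumulates one unit per read of each input element.
  const std::vector<int> src_index = {2, 1, 0, 1, 2, 3, 2, 1};
  std::vector<int> grad(4, 0);
  for (int idx : src_index) grad[idx] += 1;  // mirrors *dest_p += *src_p above
  for (int g : grad) std::cout << g << ' ';  // prints: 1 3 3 1
  std::cout << '\n';
  return 0;
}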

View File

@@ -0,0 +1,81 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_
#include <vector>
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class PadV3GradCpuKernel : public CpuKernel {
public:
PadV3GradCpuKernel() = default;
~PadV3GradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
bool padding_contiguous = true;
std::string mode = "reflect";
std::vector<int64_t> paddings;
int64_t output_w;
int64_t output_h;
int64_t output_c;
int64_t input_w;
int64_t input_h;
int64_t input_c;
int64_t i_start_x;
int64_t i_start_y;
int64_t i_start_z;
int64_t o_start_x;
int64_t o_start_y;
int64_t o_start_z;
int64_t pad_l;
int64_t pad_t;
int64_t pad_f;
int64_t parallelSliceNum;
int64_t num_elem;
int64_t input_dim;
uint32_t PadV3GradCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t PadV3GradCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t PadV3GradCompute1D(T *input, T *output, int64_t p);
template <typename T>
uint32_t PadV3GradCompute2D(T *input, T *output, int64_t p, int64_t i);
template <typename T>
uint32_t PadV3GradCompute3D(T *input, T *output, int64_t p, int64_t z);
template <typename T>
uint32_t PadV3GradCompute1(T *input, T *output, int64_t p);
int64_t IndexCalculate(int64_t pad_value, int64_t now, int64_t output_value, int64_t o_start, int64_t i_start);
template <typename T>
uint32_t PadV3ReadPaddingsAndSetOutputShape1(CpuKernelContext &ctx);
template <typename T>
uint32_t PadV3ReadPaddingsAndSetOutputShape2(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_

View File

@@ -75,8 +75,6 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kFillOpName,
mindspore::kLogMatrixDeterminantOpName,
mindspore::kMatrixSolveLsOpName,
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMedianOpName,
mindspore::kACosGradOpName,
mindspore::kAcoshGradOpName,
@@ -244,7 +242,11 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kLuUnpackOpName,
mindspore::kLuUnpackGradOpName,
mindspore::kMatMulOpName,
mindspore::kMatrixExpOpName};
mindspore::kMatrixExpOpName,
mindspore::kPadV3GradOpName,
mindspore::kPadV3OpName,
mindspore::kLogicalXorOpName,
mindspore::kLogNormalReverseOpName};
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";

View File

@@ -340,3 +340,5 @@ from .lstsq import _lstsq_aicpu
from .lu_unpack import _lu_unpack_aicpu
from .lu_unpack_grad import _lu_unpack_grad_aicpu
from .matrix_exp import _matrix_exp_aicpu
from .pad_v3_grad import _pad_v3_grad_aicpu
from .pad_v3 import _pad_v3_aicpu

View File

@@ -765,8 +765,12 @@ def resize_nearest_neighbor(input_x, size, align_corners=False):
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> import numpy as np
>>> import mindspore
>>> from mindspore import Tensor, ops
>>> input_tensor = Tensor(np.array([[[[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]]]), mindspore.float32)
>>> output = ops.ResizeNearestNeighbor(input_tensor, (2, 2))
>>> size = (2, 2)
>>> output = ops.ResizeNearestNeighbor(size=size)(input_tensor)
>>> print(output)
[[[[-0.1 0.3]
[ 0.4 0.5]]]]

View File

@@ -3744,9 +3744,12 @@ class ResizeNearestNeighbor(Primitive):
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> import numpy as np
>>> import mindspore
>>> from mindspore import Tensor, ops
>>> input_tensor = Tensor(np.array([[[[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]]]), mindspore.float32)
>>> resize = ops.ResizeNearestNeighbor((2, 2))
>>> output = resize(input_tensor)
>>> size = (2, 2)
>>> output = ops.ResizeNearestNeighbor(size=size)(input_tensor)
>>> print(output)
[[[[-0.1 0.3]
[ 0.4 0.5]]]]