forked from mindspore-Ecosystem/mindspore
!48020 migrate PadV3 and other aicpu ops
Merge pull request !48020 from 李林杰/0118_block_apicpu_ops_that_might_have_issues
Commit 77442c99a2
@ -539,6 +539,8 @@ constexpr auto kPadAndShiftOpName = "PadAndShift";
constexpr auto kPaddingOpName = "Padding";
constexpr auto kPadOpName = "Pad";
constexpr auto kPadDOpName = "PadD";
constexpr auto kPadV3GradOpName = "PadV3Grad";
constexpr auto kPadV3OpName = "PadV3";
constexpr auto kParallelResizeBilinearOpName = "ParallelResizeBilinear";
constexpr auto kSyncResizeBilinearV2OpName = "SyncResizeBilinearV2";
constexpr auto kParallelResizeBilinearGradOpName = "ParallelResizeBilinearGrad";
@ -0,0 +1,198 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "logical_xor.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kLogicalXor = "LogicalXor";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
} // namespace

namespace aicpu {
uint32_t LogicalXorCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LogicalXor check input and output number failed.");
  KERNEL_HANDLE_ERROR(LogicalXorCheck(ctx), "LogicalXor check params or bcast failed.");
  uint32_t result = LogicalXorCompute<bool>(ctx);
  if (result != KERNEL_STATUS_OK) {
    KERNEL_LOG_ERROR("LogicalXor kernel compute failed.");
    return result;
  }
  return KERNEL_STATUS_OK;
}

uint32_t LogicalXorCpuKernel::LogicalXorCheck(CpuKernelContext &ctx) {
  // the non null of input_0, input_1, output has been verified in NormalCheck
  Tensor *input_0 = ctx.Input(0);
  Tensor *input_1 = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  DataType input0_type = input_0->GetDataType();
  DataType input1_type = input_1->GetDataType();
  KERNEL_CHECK_FALSE((input0_type == input1_type && input0_type == DT_BOOL), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] need be same with "
                     "input1 [%s] and both should be bool.",
                     DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
  KERNEL_LOG_DEBUG(
    "LogicalXorCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());

  return KERNEL_STATUS_OK;
}

/**
 * special compute is used in the following situations.
 * 1. the shapes of input1 and input2 are the same
 * 2. input1 is a 1D tensor with only one element or input1 is scalar
 * 3. input2 is a 1D tensor with only one element or input2 is scalar
 * 4. the shapes of input1 and input2 are different
 */
template <typename T>
void LogicalXorCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
                                         const T *input2, bool *output) {
  switch (type) {
    case BcastShapeType::SAME_SHAPE:
      for (int64_t i = start; i < end; ++i) {
        *(output + i) = *(input1 + i) != *(input2 + i);
      }
      break;
    case BcastShapeType::X_ONE_ELEMENT:
      for (int64_t i = start; i < end; ++i) {
        *(output + i) = *input1 != *(input2 + i);
      }
      break;
    case BcastShapeType::Y_ONE_ELEMENT:
      for (int64_t i = start; i < end; ++i) {
        *(output + i) = *(input1 + i) != *input2;
      }
      break;
    default:
      KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
      break;
  }
}

template <typename T>
uint32_t LogicalXorCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
  auto input_0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input_1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<bool *>(ctx.Output(0)->GetData());
  int64_t input_0_elements_nums = ctx.Input(0)->NumElements();
  int64_t input_1_elements_nums = ctx.Input(1)->NumElements();
  int64_t data_num = ctx.Output(0)->NumElements();
  BcastShapeType type =
    input_0_elements_nums == input_1_elements_nums
      ? BcastShapeType::SAME_SHAPE
      : (input_0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);

  if (data_num >= kParallelDataNumSameShape) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);

    if (data_num <= kParallelDataNumSameShapeMid) {
      max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_LogicalXor = [&](int64_t start, int64_t end) {
      SpecialCompute<T>(type, start, end, input_0, input_1, out);
    };

    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_LogicalXor),
                        "LogicalXor Compute failed.");
  } else {
    SpecialCompute<T>(type, 0, data_num, input_0, input_1, out);
  }

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LogicalXorCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
  auto input_0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto input_1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<bool *>(ctx.Output(0)->GetData());
  int64_t data_num = ctx.Output(0)->NumElements();

  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_LogicalXor = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; ++i) {
        *(out + i) =
          *(input_0 + bcast.GetBroadcastXIndex(i)) != *(input_1 + bcast.GetBroadcastYIndex(i)) ? true : false;
      }
    };

    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_LogicalXor),
                        "LogicalXor Compute failed.");
  } else {
    for (int64_t i = 0; i < data_num; ++i) {
      *(out + i) = *(input_0 + bcast.GetBroadcastXIndex(i)) != *(input_1 + bcast.GetBroadcastYIndex(i)) ? true : false;
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t LogicalXorCpuKernel::LogicalXorCompute(CpuKernelContext &ctx) {
  Tensor *input0_tensor = ctx.Input(0);
  auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
  int64_t input0_elements_nums = input0_tensor->NumElements();

  Tensor *input1_tensor = ctx.Input(1);
  auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
  int64_t input1_elements_nums = input1_tensor->NumElements();

  // Note: despite its name, isNeedBcast is true when broadcasting is NOT required,
  // i.e. the shapes already match or one of the inputs has a single element.
  bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
  if (isNeedBcast) {
    return NoBcastCompute<T>(ctx);
  } else {
    Bcast bcast(input0_shape, input1_shape);
    if (!bcast.IsValid()) {
      KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }

    return BcastCompute<T>(ctx, bcast);
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kLogicalXor, LogicalXorCpuKernel);
} // namespace aicpu
@ -0,0 +1,49 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_
#define AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {

class LogicalXorCpuKernel : public CpuKernel {
 public:
  LogicalXorCpuKernel() = default;
  ~LogicalXorCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  static uint32_t LogicalXorCheck(CpuKernelContext &ctx);

  template <typename T>
  void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, bool *output);

  template <typename T>
  uint32_t NoBcastCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);

  template <typename T>
  uint32_t LogicalXorCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
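The LogicalXor kernel above reduces logical XOR to an element-wise != over bool data and dispatches on three shape cases (same shape, x is a single element, y is a single element). A minimal standalone sketch of that dispatch, independent of the aicpu framework types; the enum and function names below are illustrative, not the ones used in the kernel:

#include <cstdint>
#include <cstdio>
#include <vector>

enum class Shape { kSameShape, kXOneElement, kYOneElement };

// Element-wise XOR over bools: out[i] = x[i] != y[i], with scalar broadcast for x or y.
void LogicalXorSketch(Shape type, const std::vector<bool> &x, const std::vector<bool> &y, std::vector<bool> *out) {
  for (size_t i = 0; i < out->size(); ++i) {
    bool a = (type == Shape::kXOneElement) ? x[0] : x[i];
    bool b = (type == Shape::kYOneElement) ? y[0] : y[i];
    (*out)[i] = a != b;
  }
}

int main() {
  std::vector<bool> x = {true, true, false, false};
  std::vector<bool> y = {true};  // broadcast against a single element
  std::vector<bool> out(x.size());
  LogicalXorSketch(Shape::kYOneElement, x, y, &out);
  for (bool v : out) printf("%d ", static_cast<int>(v));  // prints: 0 0 1 1
  printf("\n");
  return 0;
}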
@ -0,0 +1,533 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "pad_v3.h"

#include <algorithm>
#include <array>
#include <iostream>
#include <vector>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const char *kPadV3 = "PadV3";
constexpr int64_t kMinCoreNum = 1;
constexpr int64_t kParallelNum = 1024 * 16;
constexpr int64_t kInput3D = 3;
constexpr int64_t kInput4D = 4;
constexpr int64_t kInput5D = 5;
constexpr int64_t kPadding1D = 2;
constexpr int64_t kPadding2D = 4;
constexpr int64_t kPadding3D = 6;
constexpr int64_t kNum2 = 2;
constexpr int64_t kNum3 = 3;
constexpr int64_t kNum4 = 4;

const std::vector<std::string> mode_list = {"constant", "reflect", "edge"};
using float16 = Eigen::half;

#define PAD_V3_COMPUTE_CASE(DTYPE, TYPE, CTX)           \
  case (DTYPE): {                                       \
    uint32_t result = DoCompute<TYPE>(CTX);             \
    if (result != KERNEL_STATUS_OK) {                   \
      KERNEL_LOG_ERROR("PadV3 kernel compute failed."); \
      return result;                                    \
    }                                                   \
    break;                                              \
  }
} // namespace

namespace aicpu {
uint32_t PadV3CpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input x failed")
  KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input paddings failed")
  KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output y failed")
  KERNEL_HANDLE_ERROR(CheckAndInitParams(ctx), "PadV3 check and init params failed.");
  auto paddings_type = ctx.Input(1)->GetDataType();
  if (paddings_type == DT_INT32) {
    KERNEL_CHECK_FALSE((GetPaddingsAndSetOuputShape<int32_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
                       "Get paddings and set output shape failed.");
  } else if (paddings_type == DT_INT64) {
    KERNEL_CHECK_FALSE((GetPaddingsAndSetOuputShape<int64_t>(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
                       "Get paddings and set output shape failed.");
  } else {
    KERNEL_LOG_ERROR("PadV3 paddings data type [%s] not support.", DTypeStr(paddings_type).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  auto data_type_ = ctx.Input(0)->GetDataType();
  switch (data_type_) {
    PAD_V3_COMPUTE_CASE(DT_INT8, int8_t, ctx)
    PAD_V3_COMPUTE_CASE(DT_INT16, int16_t, ctx)
    PAD_V3_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    PAD_V3_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    PAD_V3_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
    PAD_V3_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
    PAD_V3_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
    PAD_V3_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
    PAD_V3_COMPUTE_CASE(DT_FLOAT16, float16, ctx)
    PAD_V3_COMPUTE_CASE(DT_FLOAT, float, ctx)
    PAD_V3_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    PAD_V3_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
    PAD_V3_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
    default:
      KERNEL_LOG_ERROR("PadV3 kernel data type [%s] not support.", DTypeStr(data_type_).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

int64_t PadV3CpuKernel::EdgeIndexCaculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start,
                                          int64_t i_start) {
  int64_t ip;
  if (now < pad_value) {
    ip = pad_value;
  } else if (now >= pad_value && now < input_value + pad_value) {
    ip = now;
  } else {
    ip = input_value + pad_value - 1;
  }
  ip = ip - o_start + i_start;
  return ip;
}

template <typename T>
uint32_t PadV3CpuKernel::EdgeCompute1D(T *input, T *output, int64_t p) {
  int64_t nplane = 0;
  int64_t input_w = input_shape[kNum2];
  int64_t output_w = output_shape.end()[-1];
  int64_t pad_l = paddings[0];
  int64_t i_start_x = std::max(int64_t(0), -pad_l);
  int64_t o_start_x = std::max(int64_t(0), pad_l);
  int64_t ip_x;
  for (int64_t j = 0; j < output_w; ++j) {
    ip_x = EdgeIndexCaculate(pad_l, j, input_w, o_start_x, i_start_x);
    T *dest_p = output + p * output_w * (nplane + 1) + j;
    T *src_p = input + p * input_w * (nplane + 1) + ip_x;
    *dest_p = *src_p;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3CpuKernel::EdgeCompute2D(T *input, T *output, int64_t p) {
  int64_t pad_l = paddings[0];
  int64_t pad_t = paddings[kNum2];
  int64_t nplane = 0;
  int64_t input_h = input_shape[kNum2];
  int64_t input_w = input_shape[kNum3];
  int64_t output_h = input_h + pad_t + paddings[kNum3];
  int64_t output_w = input_w + pad_l + paddings[1];
  int64_t i_start_x = std::max(int64_t(0), -pad_l);
  int64_t i_start_y = std::max(int64_t(0), -pad_t);
  int64_t o_start_x = std::max(int64_t(0), pad_l);
  int64_t o_start_y = std::max(int64_t(0), pad_t);
  int64_t ip_x, ip_y;
  for (int64_t i = 0; i < output_h; ++i) {
    for (int64_t j = 0; j < output_w; ++j) {
      ip_x = EdgeIndexCaculate(pad_l, j, input_w, o_start_x, i_start_x);
      ip_y = EdgeIndexCaculate(pad_t, i, input_h, o_start_y, i_start_y);
      T *dest_p = output + p * output_w * output_h * (nplane + 1) + i * output_w + j;
      T *src_p = input + p * input_w * input_h * (nplane + 1) + ip_y * input_w + ip_x;
      *dest_p = *src_p;
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3CpuKernel::EdgeCompute3D(T *input, T *output, int64_t p) {
  int64_t pad_l = paddings[0];
  int64_t pad_t = paddings[kNum2];
  int64_t pad_f = paddings[kNum4];
  int64_t nplane = 0;
  int64_t input_d = input_shape[kNum2];
  int64_t input_h = input_shape[kNum3];
  int64_t input_w = input_shape[kNum4];
  int64_t output_d = output_shape[kNum2];
  int64_t output_h = output_shape[kNum3];
  int64_t output_w = output_shape[kNum4];
  int64_t i_start_x = std::max(int64_t(0), -pad_l);
  int64_t i_start_y = std::max(int64_t(0), -pad_t);
  int64_t i_start_z = std::max(int64_t(0), -pad_f);
  int64_t o_start_x = std::max(int64_t(0), pad_l);
  int64_t o_start_y = std::max(int64_t(0), pad_t);
  int64_t o_start_z = std::max(int64_t(0), pad_f);
  int64_t ip_x, ip_y, ip_z;
  for (int64_t k = 0; k < output_d; ++k) {
    for (int64_t j = 0; j < output_h; ++j) {
      for (int64_t i = 0; i < output_w; ++i) {
        ip_x = EdgeIndexCaculate(pad_l, i, input_w, o_start_x, i_start_x);
        ip_y = EdgeIndexCaculate(pad_t, j, input_h, o_start_y, i_start_y);
        ip_z = EdgeIndexCaculate(pad_f, k, input_d, o_start_z, i_start_z);
        T *dest_p =
          output + p * output_w * output_h * output_d * (nplane + 1) + k * output_w * output_h + j * output_w + i;
        T *src_p =
          input + p * input_w * input_h * input_d * (nplane + 1) + ip_z * input_w * input_h + ip_y * input_w + ip_x;
        *dest_p = *src_p;
      }
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3CpuKernel::EdgeModeCompute(CpuKernelContext &ctx, int64_t p) {
  auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  if (paddings_num == kPadding1D) {
    EdgeCompute1D<T>(input, output, p);
  } else if (paddings_num == kPadding2D) {
    EdgeCompute2D<T>(input, output, p);
  } else if (paddings_num == kPadding3D) {
    EdgeCompute3D<T>(input, output, p);
  }
  return KERNEL_STATUS_OK;
}

int64_t PadV3CpuKernel::ReflectIndexCaculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start,
                                             int64_t i_start) {
  int64_t ip;
  if (now < pad_value) {
    ip = pad_value + pad_value - now;
  } else if (now >= pad_value && now < input_value + pad_value) {
    ip = now;
  } else {
    ip = (input_value + pad_value - 1) + (input_value + pad_value - 1) - now;
  }
  ip = ip - o_start + i_start;
  return ip;
}

template <typename T>
uint32_t PadV3CpuKernel::ReflectCompute1D(T *input, T *output, int64_t p) {
  int64_t nplane = 0;
  int64_t input_w = input_shape[kNum2];
  int64_t output_w = output_shape.end()[-1];
  int64_t pad_l = paddings[0];
  int64_t i_start_x = std::max(int64_t(0), -pad_l);
  int64_t o_start_x = std::max(int64_t(0), pad_l);
  int64_t ip_x;
  for (int64_t j = 0; j < output_w; ++j) {
    ip_x = ReflectIndexCaculate(pad_l, j, input_w, o_start_x, i_start_x);
    T *dest_p = output + p * output_w * (nplane + 1) + j;
    T *src_p = input + p * input_w * (nplane + 1) + ip_x;
    *dest_p = *src_p;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3CpuKernel::ReflectCompute2D(T *input, T *output, int64_t p) {
  int64_t pad_l = paddings[0];
  int64_t pad_t = paddings[kNum2];
  int64_t nplane = 0;
  int64_t input_h = input_shape[kNum2];
  int64_t input_w = input_shape[kNum3];
  int64_t output_h = input_h + pad_t + paddings[kNum3];
  int64_t output_w = input_w + pad_l + paddings[1];
  int64_t i_start_x = std::max(int64_t(0), -pad_l);
  int64_t i_start_y = std::max(int64_t(0), -pad_t);
  int64_t o_start_x = std::max(int64_t(0), pad_l);
  int64_t o_start_y = std::max(int64_t(0), pad_t);
  int64_t ip_x, ip_y;
  for (int64_t i = 0; i < output_h; ++i) {
    for (int64_t j = 0; j < output_w; ++j) {
      ip_x = ReflectIndexCaculate(pad_l, j, input_w, o_start_x, i_start_x);
      ip_y = ReflectIndexCaculate(pad_t, i, input_h, o_start_y, i_start_y);
      T *dest_p = output + p * output_w * output_h * (nplane + 1) + i * output_w + j;
      T *src_p = input + p * input_w * input_h * (nplane + 1) + ip_y * input_w + ip_x;
      *dest_p = *src_p;
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3CpuKernel::ReflectCompute3D(T *input, T *output, int64_t p) {
  int64_t pad_l = paddings[0];
  int64_t pad_t = paddings[kNum2];
  int64_t pad_f = paddings[kNum4];
  int64_t nplane = 0;
  int64_t input_d = input_shape[kNum2];
  int64_t input_h = input_shape[kNum3];
  int64_t input_w = input_shape[kNum4];
  int64_t output_d = output_shape[kNum2];
  int64_t output_h = output_shape[kNum3];
  int64_t output_w = output_shape[kNum4];
  int64_t i_start_x = std::max(int64_t(0), -pad_l);
  int64_t i_start_y = std::max(int64_t(0), -pad_t);
  int64_t i_start_z = std::max(int64_t(0), -pad_f);
  int64_t o_start_x = std::max(int64_t(0), pad_l);
  int64_t o_start_y = std::max(int64_t(0), pad_t);
  int64_t o_start_z = std::max(int64_t(0), pad_f);
  int64_t ip_x, ip_y, ip_z;
  for (int64_t k = 0; k < output_d; ++k) {
    for (int64_t j = 0; j < output_h; ++j) {
      for (int64_t i = 0; i < output_w; ++i) {
        ip_x = ReflectIndexCaculate(pad_l, i, input_w, o_start_x, i_start_x);
        ip_y = ReflectIndexCaculate(pad_t, j, input_h, o_start_y, i_start_y);
        ip_z = ReflectIndexCaculate(pad_f, k, input_d, o_start_z, i_start_z);
        T *dest_p =
          output + p * output_w * output_h * output_d * (nplane + 1) + k * output_w * output_h + j * output_w + i;
        T *src_p =
          input + p * input_w * input_h * input_d * (nplane + 1) + ip_z * input_w * input_h + ip_y * input_w + ip_x;
        *dest_p = *src_p;
      }
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3CpuKernel::ReflectModeCompute(CpuKernelContext &ctx, int64_t p) {
  auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  if (paddings_num == kPadding1D) {
    ReflectCompute1D<T>(input, output, p);
  } else if (paddings_num == kPadding2D) {
    ReflectCompute2D<T>(input, output, p);
  } else if (paddings_num == kPadding3D) {
    ReflectCompute3D<T>(input, output, p);
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3CpuKernel::ConstantModeCompute(CpuKernelContext &ctx, T constant_values) {
  auto input_ptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output_ptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t output_num = ctx.Output(0)->NumElements();
  int64_t input_num = 1;
  std::vector<int64_t> input_strides(input_dims, 0);
  std::vector<int64_t> output_strides(input_dims, 0);
  input_strides[input_dims - 1] = 1;
  output_strides[input_dims - 1] = 1;
  for (int64_t i = input_dims - 1; i >= 1; --i) {
    input_strides[i - 1] = input_strides[i] * input_shape[i];
    output_strides[i - 1] = output_strides[i] * output_shape[i];
  }
  std::vector<int64_t> offsets(input_dims, 0);
  std::vector<int64_t> extents(input_dims, 0);
  for (int64_t i = input_dims - 1; i >= 0; --i) {
    extents[i] = input_shape[i];
    if (paddings[i * kNum2] < 0) {
      extents[i] += paddings[i * kNum2];
      offsets[i] = -paddings[i * kNum2];
      paddings[i * kNum2] = 0;
    }
    if (paddings[i * kNum2 + 1] < 0) {
      extents[i] += paddings[i * kNum2 + 1];
      paddings[i * kNum2 + 1] = 0;
    }
    input_shape[i] = extents[i];
    input_num *= input_shape[i];
  }
  std::vector<T> input_values;
  for (int64_t i = 0; i < input_num; ++i) {
    int64_t k = i;
    int64_t p = 0;
    for (int64_t j = input_dims - 1; j >= 0; --j) {
      p += (offsets[j] + (k % extents[j])) * input_strides[j];
      k /= extents[j];
    }
    input_values.push_back(*(input_ptr + p));
  }
  for (int64_t i = 0; i < output_num; ++i) {
    *(output_ptr + i) = constant_values;
  }
  if (input_dims == 1) {
    for (int64_t i = 0; i < input_num; ++i) {
      *(output_ptr + paddings[0] + i) = input_values[i];
    }
    return KERNEL_STATUS_OK;
  }
  std::vector<int64_t> i_inx_add(input_dims, 0);
  std::vector<int64_t> o_inx_add(input_dims, 0);
  i_inx_add[input_dims - 1] = output_strides[input_dims - 1] * paddings[kNum2 * (input_dims - 1)];
  o_inx_add[input_dims - 1] = output_strides[input_dims - 1] * paddings[kNum2 * (input_dims - 1) + 1];
  for (int64_t i = input_dims - 1; i >= 1; --i) {
    i_inx_add[i - 1] = i_inx_add[i] + output_strides[i - 1] * paddings[kNum2 * (i - 1)];
    o_inx_add[i - 1] = o_inx_add[i] + output_strides[i - 1] * paddings[kNum2 * (i - 1) + 1];
  }
  int64_t i_inx = 0;
  int64_t o_inx = i_inx_add[0];
  std::vector<int64_t> pos(input_dims - 1, 0);
  while (i_inx < input_num) {
    for (int64_t i = 0; i < input_shape[input_dims - 1]; ++i) {
      *(output_ptr + o_inx + i) = input_values[i_inx + i];
    }
    pos[input_dims - kNum2] += 1;
    int64_t dep = input_dims - 1;
    for (int64_t i = input_dims - kNum2; i >= 0; --i) {
      if (i > 0 && pos[i] >= input_shape[i]) {
        pos[i] -= input_shape[i];
        pos[i - 1] += 1;
        dep = i;
      } else {
        break;
      }
    }
    o_inx += i_inx_add[dep] + o_inx_add[dep] + input_shape[input_dims - 1];
    i_inx += input_shape[input_dims - 1];
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3CpuKernel::DoCompute(CpuKernelContext &ctx) {
  if (mode == "constant") {
    T constant_values = static_cast<T>(0);
    if (ctx.Input(kNum2) != nullptr) {
      constant_values = *(reinterpret_cast<T *>(ctx.Input(kNum2)->GetData()));
    } else {
      KERNEL_LOG_DEBUG("Get attr [constant_values] failed, use default value [0]");
    }
    for (int64_t i = 0; i < input_dims / kNum2; ++i) {
      int64_t u = paddings[i * kNum2];
      int64_t v = paddings[i * kNum2 + 1];
      paddings[i * kNum2] = paddings[kNum2 * (input_dims - i - 1)];
      paddings[i * kNum2 + 1] = paddings[kNum2 * (input_dims - i - 1) + 1];
      paddings[kNum2 * (input_dims - i - 1)] = u;
      paddings[kNum2 * (input_dims - i - 1) + 1] = v;
    }
    ConstantModeCompute<T>(ctx, constant_values);
  } else if (mode == "reflect") {
    auto shard_padv3_reflcet = [&](int64_t start, int64_t end) {
      for (int p = start; p < end; p++) {
        ReflectModeCompute<T>(ctx, p);
      }
    };
    const int64_t data_num = parallelSliceNum;
    const bool enable_parallel = data_num > kParallelNum;
    if (enable_parallel) {
      const int64_t max_core_num =
        std::max(static_cast<int64_t>(kMinCoreNum), static_cast<int64_t>(aicpu::CpuKernelUtils::GetCPUNum(ctx)));
      const int64_t per_unit_size = data_num / std::min(data_num, max_core_num);
      KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, per_unit_size, shard_padv3_reflcet),
                          "PadV3 Compute failed.");
    } else {
      shard_padv3_reflcet(0, data_num);
    }
  } else if (mode == "edge") {
    auto shard_padv3_edge = [&](int64_t start, int64_t end) {
      for (int p = start; p < end; p++) {
        EdgeModeCompute<T>(ctx, p);
      }
    };
    const int64_t data_num = parallelSliceNum;
    const bool enable_parallel = data_num > kParallelNum;
    if (enable_parallel) {
      const int64_t max_core_num =
        std::max(static_cast<int64_t>(kMinCoreNum), static_cast<int64_t>(aicpu::CpuKernelUtils::GetCPUNum(ctx)));
      const int64_t per_unit_size = data_num / std::min(data_num, max_core_num);
      KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, per_unit_size, shard_padv3_edge),
                          "PadV3 Compute failed.");
    } else {
      shard_padv3_edge(0, data_num);
    }
  }
  return KERNEL_STATUS_OK;
}

uint32_t PadV3CpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
  if (ctx.GetAttr("mode") == nullptr) {
    mode = "constant";
    KERNEL_LOG_DEBUG("Get attr [mode] failed, use default value [constant]");
  } else {
    mode = ctx.GetAttr("mode")->GetString();
    const bool is_mode_available = std::find(mode_list.begin(), mode_list.end(), mode) != mode_list.end();
    if (is_mode_available == false) {
      KERNEL_LOG_ERROR(
        "Attr [mode] must be included in [constant, reflect, edge], but got "
        "[%s]",
        mode.c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  if (ctx.GetAttr("paddings_contiguous") != nullptr) {
    paddings_contiguous = ctx.GetAttr("paddings_contiguous")->GetBool();
  } else {
    paddings_contiguous = true;
    KERNEL_LOG_DEBUG("Get attr [paddings_contiguous] failed, use default value [true]");
  }
  if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
    KERNEL_LOG_ERROR("Tensor y dtype[%s] must be same with x dtype[%s]", DTypeStr(ctx.Output(0)->GetDataType()).c_str(),
                     DTypeStr(ctx.Input(0)->GetDataType()).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  input_dims = ctx.Input(0)->GetTensorShape()->GetDims();
  const std::vector<int64_t> paddings_shape = ctx.Input(1)->GetTensorShape()->GetDimSizes();
  paddings_num = ctx.Input(1)->NumElements();
  KERNEL_CHECK_FALSE(paddings_shape.size() == 1 && paddings_num == input_dims * kNum2, KERNEL_STATUS_PARAM_INVALID,
                     "Paddings shape is not supported");
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3CpuKernel::GetPaddingsAndSetOuputShape(CpuKernelContext &ctx) {
  auto paddings_ptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  paddings = std::vector<int64_t>(input_dims * kNum2, 0);
  for (int64_t i = 0; i < paddings_num; i += kNum2) {
    paddings[i] = static_cast<int64_t>(paddings_ptr[paddings_num - i - kNum2]);
    paddings[i + 1] = static_cast<int64_t>(paddings_ptr[paddings_num - i - 1]);
  }
  if (mode == "edge" || mode == "reflect" || (mode == "constant" && paddings_contiguous == false)) {
    paddings_num = paddings_num - kNum4;
  }
  if (paddings_contiguous == false) {
    std::vector<int64_t> tmp = paddings;
    for (int64_t i = 0; i < paddings_num; ++i) {
      if (i % kNum2 == 0) {
        paddings[i] = tmp[i / kNum2];
      } else {
        paddings[i] = tmp[(i + paddings_num) / kNum2];
      }
    }
  }
  input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  parallelSliceNum = 1;
  for (int64_t i = 0; i < input_dims - paddings_num / kNum2; ++i) {
    parallelSliceNum *= input_shape[i];
  }
  for (int64_t i = 0; i < paddings_num / kNum2; ++i) {
    output_shape.end()[-(i + 1)] += (paddings[i * kNum2] + paddings[i * kNum2 + 1]);
    KERNEL_CHECK_FALSE(output_shape.end()[-(i + 1)] > 0, KERNEL_STATUS_PARAM_INVALID,
                       "output_shape number must be greater than 0");
    KERNEL_CHECK_FALSE(input_shape.end()[-(i + 1)] >= std::max(-paddings[i * kNum2], -paddings[i * kNum2 + 1]),
                       KERNEL_STATUS_PARAM_INVALID,
                       "Padding size should be less than the corresponding input dimension");
    if (mode == "reflect") {
      KERNEL_CHECK_FALSE(input_shape.end()[-(i + 1)] > std::max(paddings[i * kNum2], paddings[i * kNum2 + 1]),
                         KERNEL_STATUS_PARAM_INVALID,
                         "Padding size should be less than the corresponding input dimension");
    }
  }
  if (output_shape != ctx.Output(0)->GetTensorShape()->GetDimSizes()) {
    ctx.Output(0)->GetTensorShape()->SetDimSizes(output_shape);
    KERNEL_LOG_DEBUG("Set output tensor shape success, num elements:[%llu]",
                     static_cast<uint64_t>(ctx.Output(0)->NumElements()));
  } else {
    KERNEL_LOG_DEBUG("Output tensor is a const tensor, num elements:[%llu]",
                     static_cast<uint64_t>(ctx.Output(0)->NumElements()));
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kPadV3, PadV3CpuKernel);
} // namespace aicpu
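The reflect and edge branches above both rely on a per-axis index mapping (ReflectIndexCaculate / EdgeIndexCaculate) from an output position back to an input position. A small standalone sketch reproducing the same reflect arithmetic for a 1-D row of width 4 padded by 2 on each side; the parameter values are illustrative only:

#include <cstdint>
#include <cstdio>

// Same arithmetic as ReflectIndexCaculate above, reproduced in isolation.
int64_t ReflectIndex(int64_t pad, int64_t now, int64_t in, int64_t o_start, int64_t i_start) {
  int64_t ip;
  if (now < pad) {
    ip = pad + pad - now;                        // mirror around the left edge
  } else if (now < in + pad) {
    ip = now;                                    // interior: identity
  } else {
    ip = (in + pad - 1) + (in + pad - 1) - now;  // mirror around the right edge
  }
  return ip - o_start + i_start;
}

int main() {
  const int64_t pad = 2, in = 4, o_start = 2, i_start = 0;
  // Input row {a b c d}, reflect-padded by 2 on each side -> {c b a b c d c b}
  for (int64_t j = 0; j < in + 2 * pad; ++j) {
    printf("%lld ", static_cast<long long>(ReflectIndex(pad, j, in, o_start, i_start)));
  }
  printf("\n");  // prints: 2 1 0 1 2 3 2 1
  return 0;
}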
@ -0,0 +1,89 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_PAD_V3_H_
#define AICPU_KERNELS_NORMALIZED_PAD_V3_H_

#include <memory>
#include <utility>
#include <vector>

#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/bcast.h"

namespace aicpu {
class PadV3CpuKernel : public CpuKernel {
 public:
  PadV3CpuKernel() = default;
  ~PadV3CpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  std::vector<int64_t> paddings;
  std::vector<int64_t> input_shape;
  std::vector<int64_t> output_shape;
  std::string mode;
  bool paddings_contiguous;
  int64_t input_dims{0};
  int64_t paddings_num{0};
  int64_t parallelSliceNum{1};

  uint32_t CheckAndInitParams(CpuKernelContext &ctx);

  template <typename T>
  uint32_t GetPaddingsAndSetOuputShape(CpuKernelContext &ctx);

  template <typename T>
  uint32_t DoCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t EdgeModeCompute(CpuKernelContext &ctx, int64_t p);

  template <typename T>
  uint32_t EdgeCompute3D(T *input, T *output, int64_t p);

  template <typename T>
  uint32_t EdgeCompute2D(T *input, T *output, int64_t p);

  template <typename T>
  uint32_t EdgeCompute1D(T *input, T *output, int64_t p);

  int64_t EdgeIndexCaculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, int64_t i_start);

  template <typename T>
  uint32_t ReflectModeCompute(CpuKernelContext &ctx, int64_t p);

  template <typename T>
  uint32_t ReflectCompute3D(T *input, T *output, int64_t p);

  template <typename T>
  uint32_t ReflectCompute2D(T *input, T *output, int64_t p);

  template <typename T>
  uint32_t ReflectCompute1D(T *input, T *output, int64_t p);

  int64_t ReflectIndexCaculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, int64_t i_start);

  template <typename T>
  uint32_t ConstantModeCompute(CpuKernelContext &ctx, T constant_values);
};
} // namespace aicpu
#endif
@ -0,0 +1,367 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "pad_v3_grad.h"

#include <algorithm>
#include <array>
#include <iostream>
#include <vector>

#include "securec.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const char *kPadV3Grad = "PadV3Grad";
constexpr uint32_t kInputNum = 2;
constexpr uint32_t kOutputNum = 1;
constexpr int64_t kParallelNum = 1024 * 64;
const int64_t k3DNum = 6;
const int64_t k2DNum = 4;
const int64_t k1DNum = 2;
constexpr int64_t kpad_l = 0;
constexpr int64_t kpad_t = 2;
constexpr int64_t kpad_f = 4;
constexpr int64_t kwidth = 1;
constexpr int64_t kheight = 2;
constexpr int64_t kchannel = 3;
constexpr int64_t kInput1Dim = 3;
constexpr int64_t kInput2Dim = 4;
constexpr int64_t kInput3Dim = 5;
constexpr int64_t k2Num = 2;
constexpr int64_t k3Num = 3;
constexpr int64_t k4Num = 4;

const std::vector<std::string> mode_list = {"reflect", "edge"};
using float16 = Eigen::half;

#define PAD_V3_GRAD_READ_PADDINGS(DTYPE, TYPE, CTX)                     \
  case (DTYPE): {                                                       \
    uint32_t result1 = PadV3ReadPaddingsAndSetOutputShape1<TYPE>(CTX);  \
    uint32_t result2 = PadV3ReadPaddingsAndSetOutputShape2<TYPE>(CTX);  \
    if (result1 != KERNEL_STATUS_OK || result2 != KERNEL_STATUS_OK) {   \
      KERNEL_LOG_ERROR("PadV3Grad kernel compute failed.");             \
      return result1 && result2;                                        \
    }                                                                   \
    break;                                                              \
  }

#define PAD_V3_GRAD_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                             \
    uint32_t result = PadV3GradCompute<TYPE>(CTX);            \
    if (result != KERNEL_STATUS_OK) {                         \
      KERNEL_LOG_ERROR("PadV3Grad kernel compute failed.");   \
      return result;                                          \
    }                                                         \
    break;                                                    \
  }
} // namespace

namespace aicpu {
uint32_t PadV3GradCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(PadV3GradCheck(ctx), "PadV3Grad check params failed.");
  auto paddings_type = ctx.Input(1)->GetDataType();
  switch (paddings_type) {
    PAD_V3_GRAD_READ_PADDINGS(DT_INT32, int32_t, ctx)
    PAD_V3_GRAD_READ_PADDINGS(DT_INT64, int64_t, ctx)
    default:
      KERNEL_LOG_ERROR("PadV3Grad paddings data type [%s] not support.", DTypeStr(paddings_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  auto data_type = ctx.Output(0)->GetDataType();
  switch (data_type) {
    PAD_V3_GRAD_COMPUTE_CASE(DT_INT8, int8_t, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_INT16, int16_t, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_FLOAT16, float16, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
    PAD_V3_GRAD_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
    default:
      KERNEL_LOG_ERROR("PadV3Grad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

uint32_t PadV3GradCpuKernel::PadV3GradCheck(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "PadV3Grad check failed.");
  if (ctx.GetAttr("paddings_contiguous") == nullptr) {
    padding_contiguous = true;
    KERNEL_LOG_DEBUG("Get attr [paddings_contiguous] failed, use default value [true]");
  } else {
    padding_contiguous = ctx.GetAttr("paddings_contiguous")->GetBool();
  }
  if (ctx.GetAttr("mode") == nullptr) {
    mode = "reflect";
    KERNEL_LOG_DEBUG("Get attr [mode] failed, use default value [reflect]");
  } else {
    mode = ctx.GetAttr("mode")->GetString();
    const bool is_mode_available = std::find(mode_list.begin(), mode_list.end(), mode) != mode_list.end();
    if (is_mode_available == false) {
      KERNEL_LOG_ERROR("Attr [mode] must be included in [reflect, edge], but got [%s]", mode.c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }

  if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
    KERNEL_LOG_ERROR("Tensor y dtype[%s] must be same with x dtype[%s]", DTypeStr(ctx.Output(0)->GetDataType()).c_str(),
                     DTypeStr(ctx.Input(0)->GetDataType()).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }

  const std::vector<int64_t> paddings_shape = ctx.Input(1)->GetTensorShape()->GetDimSizes();
  KERNEL_CHECK_FALSE(
    paddings_shape.size() == 1 && (paddings_shape[0] == k3DNum + k4Num || paddings_shape[0] == k2DNum + k4Num ||
                                   paddings_shape[0] == k1DNum + k4Num || paddings_shape[0] == 1),
    KERNEL_STATUS_PARAM_INVALID, "Paddings shape is not supported");
  KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() >= kInput1Dim, KERNEL_STATUS_PARAM_INVALID,
                     "Dims of tensor x should be greater than or equal to 3");
  KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() <= kInput3Dim, KERNEL_STATUS_PARAM_INVALID,
                     "Only 3D, 4D, 5D padding with non-constant padding are "
                     "supported for now");

  const int64_t input_dim = ctx.Input(0)->GetTensorShape()->GetDims();
  const int64_t num_elem = ctx.Input(1)->NumElements();
  KERNEL_CHECK_FALSE(num_elem % k2Num == 0 || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
                     "Padding length must be divisible by 2");

  if (input_dim == kInput1Dim) {
    KERNEL_CHECK_FALSE(num_elem == k1DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
                       "3D tensors expect 6 values for padding");
  } else if (input_dim == kInput2Dim) {
    KERNEL_CHECK_FALSE(num_elem == k2DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
                       "4D tensors expect 8 values for padding");
  } else if (input_dim == kInput3Dim) {
    KERNEL_CHECK_FALSE(num_elem == k3DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID,
                       "5D tensors expect 10 values for padding");
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3GradCpuKernel::PadV3ReadPaddingsAndSetOutputShape1(CpuKernelContext &ctx) {
  num_elem = ctx.Input(1)->NumElements();
  input_dim = ctx.Input(0)->GetTensorShape()->GetDims();
  const std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  auto paddings_ptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  paddings = std::vector<int64_t>(input_dim * k2Num, 0);

  for (int64_t i = 0; i < num_elem; i += k2Num) {
    paddings[i] = static_cast<int64_t>(paddings_ptr[num_elem - i - k2Num]);
    paddings[i + 1] = static_cast<int64_t>(paddings_ptr[num_elem - i - 1]);
  }
  num_elem = num_elem - k4Num;
  if (num_elem == 1) {
    num_elem = k2Num * (input_dim - k2Num);
    for (int64_t i = 0; i < k2Num * (input_dim - k2Num); ++i) {
      paddings[i] = static_cast<int64_t>(paddings_ptr[0]);
    }
  }

  parallelSliceNum = 1;
  for (int64_t i = 0; i < input_dim - num_elem / k2Num; i++) {
    parallelSliceNum *= input_shape[i];
  }

  if (padding_contiguous == false && num_elem == k3DNum) {
    std::vector<int64_t> tmp = paddings;
    paddings[1] = tmp[k3Num];
    paddings[k2Num] = tmp[1];
    paddings[k3Num] = tmp[k4Num];
    paddings[k4Num] = tmp[k2Num];
  }

  if (padding_contiguous == false && num_elem == k2DNum) {
    std::vector<int64_t> tmp = paddings;
    paddings[1] = tmp[k2Num];
    paddings[k2Num] = tmp[1];
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3GradCpuKernel::PadV3ReadPaddingsAndSetOutputShape2(CpuKernelContext &ctx) {
  std::vector<int64_t> output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  output_shape.end()[-kwidth] -= (paddings[kpad_l] + paddings[kpad_l + 1]);
  output_shape.end()[-kheight] -= (paddings[kpad_t] + paddings[kpad_t + 1]);
  output_shape.end()[-kchannel] -= (paddings[kpad_f] + paddings[kpad_f + 1]);

  KERNEL_CHECK_FALSE(
    output_shape.end()[-kwidth] > 0 && output_shape.end()[-kheight] > 0 && output_shape.end()[-kchannel] > 0,
    KERNEL_STATUS_PARAM_INVALID, "output_shape number must be greater than 0");

  if (output_shape != ctx.Output(0)->GetTensorShape()->GetDimSizes()) {
    ctx.Output(0)->GetTensorShape()->SetDimSizes(output_shape);
    KERNEL_LOG_DEBUG("Set output tensor shape success, num elements:[%llu]",
                     static_cast<uint64_t>(ctx.Output(0)->NumElements()));
  } else {
    KERNEL_LOG_DEBUG("Output tensor is a const tensor, num elements:[%llu]",
                     static_cast<uint64_t>(ctx.Output(0)->NumElements()));
  }
  const std::string padding_contiguous_str = padding_contiguous ? std::string("True") : std::string("False");
  KERNEL_LOG_DEBUG(
    "PadV3GradCpuKernel[%s], x: size[%llu] dtype[%s], "
    "paddings: size[%llu] dtype[%s], y: size[%llu] dtype[%s], mode: [%s], "
    "padding_contiguous: [%s].",
    ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), DTypeStr(ctx.Input(0)->GetDataType()).c_str(),
    ctx.Input(1)->GetDataSize(), DTypeStr(ctx.Input(1)->GetDataType()).c_str(), ctx.Output(0)->GetDataSize(),
    DTypeStr(ctx.Output(0)->GetDataType()).c_str(), mode.c_str(), padding_contiguous_str.c_str());
  return KERNEL_STATUS_OK;
}

int64_t PadV3GradCpuKernel::IndexCaculate(int64_t pad_value, int64_t now, int64_t output_value, int64_t o_start,
                                          int64_t i_start) {
  int64_t ip = 0;
  if (now < pad_value) {
    if (mode == "reflect") {
      ip = pad_value + pad_value - now;
    } else if (mode == "edge") {
      ip = pad_value;
    }
  } else if (now >= pad_value && now < output_value + pad_value) {
    ip = now;
  } else {
    if (mode == "reflect") {
      ip = (output_value + pad_value - 1) + (output_value + pad_value - 1) - now;
    } else if (mode == "edge") {
      ip = output_value + pad_value - 1;
    }
  }
  ip = ip - o_start + i_start;
  return ip;
}

template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute1(T *input, T *output, int64_t p) {
  if (num_elem == k1DNum) {
    PadV3GradCompute1D<T>(input, output, p);
  } else if (num_elem == k2DNum) {
    for (int i = 0; i < input_h; i++) {
      PadV3GradCompute2D<T>(input, output, p, i);
    }
  } else if (num_elem == k3DNum) {
    for (int z = 0; z < input_c; z++) {
      PadV3GradCompute3D<T>(input, output, p, z);
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute1D(T *input, T *output, int64_t p) {
  int ip_x;
  for (int j = 0; j < input_w; j++) {
    ip_x = IndexCaculate(pad_l, j, output_w, o_start_x, i_start_x);
    T *src_p = input + p * input_w + j;
    T *dest_p = output + p * output_w + ip_x;
    *dest_p += *src_p;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute2D(T *input, T *output, int64_t p, int64_t i) {
  int ip_x, ip_y;
  for (int j = 0; j < input_w; j++) {
    ip_x = IndexCaculate(pad_l, j, output_w, o_start_x, i_start_x);
    ip_y = IndexCaculate(pad_t, i, output_h, o_start_y, i_start_y);
    T *src_p = input + p * input_w * input_h + i * input_w + j;
    T *dest_p = output + p * output_w * output_h + ip_y * output_w + ip_x;
    *dest_p += *src_p;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute3D(T *input, T *output, int64_t p, int64_t z) {
  int ip_x, ip_y, ip_z;
  for (int i = 0; i < input_h; i++) {
    for (int j = 0; j < input_w; j++) {
      ip_x = IndexCaculate(pad_l, j, output_w, o_start_x, i_start_x);
      ip_y = IndexCaculate(pad_t, i, output_h, o_start_y, i_start_y);
      ip_z = IndexCaculate(pad_f, z, output_c, o_start_z, i_start_z);
      T *src_p = input + p * input_w * input_h * input_c + z * input_w * input_h + i * input_w + j;
      T *dest_p = output + p * output_w * output_h * output_c + ip_z * output_w * output_h + ip_y * output_w + ip_x;
      *dest_p += *src_p;
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t PadV3GradCpuKernel::PadV3GradCompute(CpuKernelContext &ctx) {
  const std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> output_shape = ctx.Output(0)->GetTensorShape()->GetDimSizes();

  T *input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  T *output = reinterpret_cast<T *>(ctx.Output(0)->GetData());

  output_w = output_shape.end()[-kwidth];
  output_h = output_shape.end()[-kheight];
  output_c = output_shape.end()[-kchannel];
  input_w = input_shape.end()[-kwidth];
  input_h = input_shape.end()[-kheight];
  input_c = input_shape.end()[-kchannel];
  i_start_x = std::max(int64_t(0), -paddings[kpad_l]);
  i_start_y = std::max(int64_t(0), -paddings[kpad_t]);
  i_start_z = std::max(int64_t(0), -paddings[kpad_f]);
  o_start_x = std::max(int64_t(0), paddings[kpad_l]);
  o_start_y = std::max(int64_t(0), paddings[kpad_t]);
  o_start_z = std::max(int64_t(0), paddings[kpad_f]);
  pad_l = paddings[kpad_l];
  pad_t = paddings[kpad_t];
  pad_f = paddings[kpad_f];

  int64_t output_num_ = 1;
  for (int64_t i = 0; i < input_dim; i++) {
    output_num_ *= output_shape[i];
  }
  auto ret = memset_s(output, sizeof(T) * output_num_, 0, sizeof(T) * output_num_);
  if (ret != EOK) {
    KERNEL_LOG_ERROR("memset_s error, ret=%d", ret);
    return KERNEL_STATUS_INNER_ERROR;
  }
  auto shard_padv3_grad = [&](int64_t start, int64_t end) {
    for (int p = start; p < end; p++) {
      PadV3GradCompute1<T>(input, output, p);
    }
  };
  const int64_t data_num = parallelSliceNum;
  const bool enable_parallel = parallelSliceNum > kParallelNum;
  if (enable_parallel) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_padv3_grad),
                        "PadV3Grad Compute failed.");
  } else {
    for (int p = 0; p < data_num; p++) {
      PadV3GradCompute1<T>(input, output, p);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kPadV3Grad, PadV3GradCpuKernel);
} // namespace aicpu
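The backward kernel above walks the padded incoming gradient and accumulates into the unpadded output ("*dest_p += *src_p"), so every input cell that the forward pad replicated receives the sum of the gradients of all its copies. A minimal standalone sketch of that accumulation for a 1-D reflect pad of 2 on a width-4 row; the values and helper name are illustrative only:

#include <cstdint>
#include <cstdio>

// Reflect index map, as in the 1-D case discussed above.
int64_t ReflectIndex(int64_t pad, int64_t now, int64_t out, int64_t o_start, int64_t i_start) {
  int64_t ip;
  if (now < pad) {
    ip = pad + pad - now;
  } else if (now < out + pad) {
    ip = now;
  } else {
    ip = (out + pad - 1) + (out + pad - 1) - now;
  }
  return ip - o_start + i_start;
}

int main() {
  const int64_t pad = 2, dx_w = 4, dy_w = dx_w + 2 * pad;
  double dy[8] = {1, 1, 1, 1, 1, 1, 1, 1};  // incoming gradient over the padded row
  double dx[4] = {0, 0, 0, 0};
  // Mirrors the kernel's "*dest_p += *src_p": padded positions that came from the
  // same input cell all add their gradient back into that cell.
  for (int64_t j = 0; j < dy_w; ++j) {
    dx[ReflectIndex(pad, j, dx_w, pad, 0)] += dy[j];
  }
  for (double v : dx) printf("%.0f ", v);  // prints: 1 3 3 1
  printf("\n");
  return 0;
}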
@ -0,0 +1,81 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_

#include <vector>

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class PadV3GradCpuKernel : public CpuKernel {
 public:
  PadV3GradCpuKernel() = default;
  ~PadV3GradCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  bool padding_contiguous = true;
  std::string mode = "reflect";
  std::vector<int64_t> paddings;
  int64_t output_w;
  int64_t output_h;
  int64_t output_c;
  int64_t input_w;
  int64_t input_h;
  int64_t input_c;
  int64_t i_start_x;
  int64_t i_start_y;
  int64_t i_start_z;
  int64_t o_start_x;
  int64_t o_start_y;
  int64_t o_start_z;
  int64_t pad_l;
  int64_t pad_t;
  int64_t pad_f;
  int64_t parallelSliceNum;
  int64_t num_elem;
  int64_t input_dim;
  uint32_t PadV3GradCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t PadV3GradCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t PadV3GradCompute1D(T *input, T *output, int64_t p);

  template <typename T>
  uint32_t PadV3GradCompute2D(T *input, T *output, int64_t p, int64_t i);

  template <typename T>
  uint32_t PadV3GradCompute3D(T *input, T *output, int64_t p, int64_t z);

  template <typename T>
  uint32_t PadV3GradCompute1(T *input, T *output, int64_t p);

  int64_t IndexCaculate(int64_t pad_value, int64_t now, int64_t output_value, int64_t o_start, int64_t i_start);

  template <typename T>
  uint32_t PadV3ReadPaddingsAndSetOutputShape1(CpuKernelContext &ctx);

  template <typename T>
  uint32_t PadV3ReadPaddingsAndSetOutputShape2(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@ -75,8 +75,6 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kFillOpName,
mindspore::kLogMatrixDeterminantOpName,
mindspore::kMatrixSolveLsOpName,
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMedianOpName,
mindspore::kACosGradOpName,
mindspore::kAcoshGradOpName,
@ -244,7 +242,11 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kLuUnpackOpName,
mindspore::kLuUnpackGradOpName,
mindspore::kMatMulOpName,
mindspore::kMatrixExpOpName};
mindspore::kMatrixExpOpName,
mindspore::kPadV3GradOpName,
mindspore::kPadV3OpName,
mindspore::kLogicalXorOpName,
mindspore::kLogNormalReverseOpName};

static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
@ -340,3 +340,5 @@ from .lstsq import _lstsq_aicpu
from .lu_unpack import _lu_unpack_aicpu
from .lu_unpack_grad import _lu_unpack_grad_aicpu
from .matrix_exp import _matrix_exp_aicpu
from .pad_v3_grad import _pad_v3_grad_aicpu
from .pad_v3 import _pad_v3_aicpu
@ -765,8 +765,12 @@ def resize_nearest_neighbor(input_x, size, align_corners=False):
``Ascend`` ``GPU`` ``CPU``

Examples:
    >>> import numpy as np
    >>> import mindspore
    >>> from mindspore import Tensor, ops
    >>> input_tensor = Tensor(np.array([[[[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]]]), mindspore.float32)
    >>> output = ops.ResizeNearestNeighbor(input_tensor, (2, 2))
    >>> size = (2, 2)
    >>> output = ops.ResizeNearestNeighbor(size=size)(input_tensor)
    >>> print(output)
    [[[[-0.1 0.3]
       [ 0.4 0.5]]]]
@ -3744,9 +3744,12 @@ class ResizeNearestNeighbor(Primitive):
``Ascend`` ``GPU`` ``CPU``

Examples:
    >>> import numpy as np
    >>> import mindspore
    >>> from mindspore import Tensor, ops
    >>> input_tensor = Tensor(np.array([[[[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]]]), mindspore.float32)
    >>> resize = ops.ResizeNearestNeighbor((2, 2))
    >>> output = resize(input_tensor)
    >>> size = (2, 2)
    >>> output = ops.ResizeNearestNeighbor(size=size)(input_tensor)
    >>> print(output)
    [[[[-0.1 0.3]
       [ 0.4 0.5]]]]