From c90a952027e1c570a3e814a95fd5a0b4071e00ed Mon Sep 17 00:00:00 2001
From: lilinjie
Date: Wed, 18 Jan 2023 12:01:22 +0800
Subject: [PATCH] migrate PadV3 and other ops

---
 mindspore/ccsrc/include/common/utils/utils.h  |   2 +
 .../cpu_kernel/ms_kernel/logical_xor.cc       | 198 +++++++
 .../cpu_kernel/ms_kernel/logical_xor.h        |  49 ++
 .../aicpu_ops/cpu_kernel/ms_kernel/pad_v3.cc  | 533 ++++++++++++++++++
 .../aicpu_ops/cpu_kernel/ms_kernel/pad_v3.h   |  89 +++
 .../cpu_kernel/ms_kernel/pad_v3_grad.cc       | 367 ++++++++++++
 .../cpu_kernel/ms_kernel/pad_v3_grad.h        |  81 +++
 .../optimizer/mindir/aicpu_lib_select.cc      |   8 +-
 .../mindspore/ops/_op_impl/aicpu/__init__.py  |   2 +
 .../mindspore/ops/composite/math_ops.py       |   6 +-
 .../mindspore/ops/operations/array_ops.py     |   7 +-
 11 files changed, 1336 insertions(+), 6 deletions(-)
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logical_xor.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logical_xor.h
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3.h
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3_grad.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3_grad.h

diff --git a/mindspore/ccsrc/include/common/utils/utils.h b/mindspore/ccsrc/include/common/utils/utils.h
index 372866b1078..5b096be540b 100644
--- a/mindspore/ccsrc/include/common/utils/utils.h
+++ b/mindspore/ccsrc/include/common/utils/utils.h
@@ -512,6 +512,8 @@ constexpr auto kPadAndShiftOpName = "PadAndShift";
 constexpr auto kPaddingOpName = "Padding";
 constexpr auto kPadOpName = "Pad";
 constexpr auto kPadDOpName = "PadD";
+constexpr auto kPadV3GradOpName = "PadV3Grad";
+constexpr auto kPadV3OpName = "PadV3";
 constexpr auto kParallelResizeBilinearOpName = "ParallelResizeBilinear";
 constexpr auto kSyncResizeBilinearV2OpName = "SyncResizeBilinearV2";
 constexpr auto kParallelResizeBilinearGradOpName = "ParallelResizeBilinearGrad";
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logical_xor.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logical_xor.cc
new file mode 100644
index 00000000000..f378f129611
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logical_xor.cc
@@ -0,0 +1,198 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "logical_xor.h" +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t kOutputNum = 1; +const uint32_t kInputNum = 2; +const char *kLogicalXor = "LogicalXor"; +// when input data size is more than kParallelDataNum, use Parallel func +const int64_t kParallelDataNum = 2 * 1024; +const int64_t kParallelDataNumMid = 16 * 1024; +const int64_t kParallelDataNumSameShape = 7 * 1024; +const int64_t kParallelDataNumSameShapeMid = 35 * 1024; +} // namespace + +namespace aicpu { +uint32_t LogicalXorCpuKernel::Compute(CpuKernelContext &ctx) { + // check params + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LogicalXor check input and output number failed."); + KERNEL_HANDLE_ERROR(LogicalXorCheck(ctx), "LogicalXor check params or bcast failed."); + uint32_t result = LogicalXorCompute(ctx); + if (result != KERNEL_STATUS_OK) { + KERNEL_LOG_ERROR("LogicalXor kernel compute failed."); + return result; + } + return KERNEL_STATUS_OK; +} + +uint32_t LogicalXorCpuKernel::LogicalXorCheck(CpuKernelContext &ctx) { + // the non null of input_0, input_1, output has been verified in NormalCheck + Tensor *input_0 = ctx.Input(0); + Tensor *input_1 = ctx.Input(1); + Tensor *output = ctx.Output(0); + DataType input0_type = input_0->GetDataType(); + DataType input1_type = input_1->GetDataType(); + KERNEL_CHECK_FALSE((input0_type == input1_type && input0_type == DT_BOOL), KERNEL_STATUS_PARAM_INVALID, + "The data type of input0 [%s] need be same with " + "input1 [%s] and both should be bool.", + DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str()) + KERNEL_LOG_DEBUG( + "LogicalXorCpuKernel[%s], input0: size[%llu];" + "input1: size[%llu], output: size[%llu].", + ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize()); + + return KERNEL_STATUS_OK; +} + +/** + * special compute is used in the following situations. + * 1. the shapes of input1 and input2 are the same + * 2. input1 is a 1D tensor with only one element or input1 is scalar + * 3. input2 is a 1D tensor with only one element or input2 is scalar + * 4. the shapes of input1 and input2 are different + */ +template +void LogicalXorCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, + const T *input2, bool *output) { + switch (type) { + case BcastShapeType::SAME_SHAPE: + for (int64_t i = start; i < end; ++i) { + *(output + i) = *(input1 + i) != *(input2 + i); + } + break; + case BcastShapeType::X_ONE_ELEMENT: + for (int64_t i = start; i < end; ++i) { + *(output + i) = *input1 != *(input2 + i); + } + break; + case BcastShapeType::Y_ONE_ELEMENT: + for (int64_t i = start; i < end; ++i) { + *(output + i) = *(input1 + i) != *input2; + } + break; + default: + KERNEL_LOG_WARN("Invalid type [%d]", static_cast(type)); + break; + } +} + +template +uint32_t LogicalXorCpuKernel::NoBcastCompute(CpuKernelContext &ctx) { + auto input_0 = reinterpret_cast(ctx.Input(0)->GetData()); + auto input_1 = reinterpret_cast(ctx.Input(1)->GetData()); + auto out = reinterpret_cast(ctx.Output(0)->GetData()); + int64_t input_0_elements_nums = ctx.Input(0)->NumElements(); + int64_t input_1_elements_nums = ctx.Input(1)->NumElements(); + int64_t data_num = ctx.Output(0)->NumElements(); + BcastShapeType type = + input_0_elements_nums == input_1_elements_nums + ? BcastShapeType::SAME_SHAPE + : (input_0_elements_nums == 1 ? 
BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT); + + if (data_num >= kParallelDataNumSameShape) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2); + + if (data_num <= kParallelDataNumSameShapeMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + + if (max_core_num > data_num) { + max_core_num = data_num; + } + + auto sharder_LogicalXor = [&](int64_t start, int64_t end) { + SpecialCompute(type, start, end, input_0, input_1, out); + }; + + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_LogicalXor), + "LogicalXor Compute failed."); + } else { + SpecialCompute(type, 0, data_num, input_0, input_1, out); + } + + return KERNEL_STATUS_OK; +} + +template +uint32_t LogicalXorCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) { + auto input_0 = reinterpret_cast(ctx.Input(0)->GetData()); + auto input_1 = reinterpret_cast(ctx.Input(1)->GetData()); + auto out = reinterpret_cast(ctx.Output(0)->GetData()); + int64_t data_num = ctx.Output(0)->NumElements(); + + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2); + + if (data_num <= kParallelDataNumMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + + if (max_core_num > data_num) { + max_core_num = data_num; + } + + auto sharder_LogicalXor = [&](int64_t start, int64_t end) { + for (int64_t i = start; i < end; ++i) { + *(out + i) = + *(input_0 + bcast.GetBroadcastXIndex(i)) != *(input_1 + bcast.GetBroadcastYIndex(i)) ? true : false; + } + }; + + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_LogicalXor), + "LogicalXor Compute failed."); + } else { + for (int64_t i = 0; i < data_num; ++i) { + *(out + i) = *(input_0 + bcast.GetBroadcastXIndex(i)) != *(input_1 + bcast.GetBroadcastYIndex(i)) ? 
true : false;
+    }
+  }
+  return KERNEL_STATUS_OK;
+}
+
+template <typename T>
+uint32_t LogicalXorCpuKernel::LogicalXorCompute(CpuKernelContext &ctx) {
+  Tensor *input0_tensor = ctx.Input(0);
+  auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
+  int64_t input0_elements_nums = input0_tensor->NumElements();
+
+  Tensor *input1_tensor = ctx.Input(1);
+  auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
+  int64_t input1_elements_nums = input1_tensor->NumElements();
+
+  // broadcasting can be skipped when the shapes match or either input has a single element
+  bool no_bcast_needed = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
+  if (no_bcast_needed) {
+    return NoBcastCompute<T>(ctx);
+  } else {
+    Bcast bcast(input0_shape, input1_shape);
+    if (!bcast.IsValid()) {
+      KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
+      return KERNEL_STATUS_PARAM_INVALID;
+    }
+
+    return BcastCompute<T>(ctx, bcast);
+  }
+}
+
+REGISTER_CPU_KERNEL(kLogicalXor, LogicalXorCpuKernel);
+}  // namespace aicpu
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logical_xor.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logical_xor.h
new file mode 100644
index 00000000000..0a48bbdbe60
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logical_xor.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_
+#define AICPU_KERNELS_NORMALIZED_LOGICALXOR_H_
+
+#include "cpu_ops_kernel.h"
+#include "utils/bcast.h"
+
+namespace aicpu {
+
+class LogicalXorCpuKernel : public CpuKernel {
+ public:
+  LogicalXorCpuKernel() = default;
+  ~LogicalXorCpuKernel() override = default;
+
+ protected:
+  uint32_t Compute(CpuKernelContext &ctx) override;
+
+ private:
+  static uint32_t LogicalXorCheck(CpuKernelContext &ctx);
+
+  template <typename T>
+  void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2,
+                      bool *output);
+
+  template <typename T>
+  uint32_t NoBcastCompute(CpuKernelContext &ctx);
+
+  template <typename T>
+  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
+
+  template <typename T>
+  uint32_t LogicalXorCompute(CpuKernelContext &ctx);
+};
+}  // namespace aicpu
+#endif
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3.cc
new file mode 100644
index 00000000000..608006a5279
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3.cc
@@ -0,0 +1,533 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "pad_v3.h" + +#include +#include +#include +#include + +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const char *kPadV3 = "PadV3"; +constexpr int64_t kMinCoreNum = 1; +constexpr int64_t kParallelNum = 1024 * 16; +constexpr int64_t kInput3D = 3; +constexpr int64_t kInput4D = 4; +constexpr int64_t kInput5D = 5; +constexpr int64_t kPadding1D = 2; +constexpr int64_t kPadding2D = 4; +constexpr int64_t kPadding3D = 6; +constexpr int64_t kNum2 = 2; +constexpr int64_t kNum3 = 3; +constexpr int64_t kNum4 = 4; + +const std::vector mode_list = {"constant", "reflect", "edge"}; +using float16 = Eigen::half; + +#define PAD_V3_COMPUTE_CASE(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = DoCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("PadV3 kernel compute failed."); \ + return result; \ + } \ + break; \ + } +} // namespace + +namespace aicpu { +uint32_t PadV3CpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input x failed") + KERNEL_CHECK_NULLPTR(ctx.Input(1)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input paddings failed") + KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output y failed") + KERNEL_HANDLE_ERROR(CheckAndInitParams(ctx), "PadV3 check and init params failed."); + auto paddings_type = ctx.Input(1)->GetDataType(); + if (paddings_type == DT_INT32) { + KERNEL_CHECK_FALSE((GetPaddingsAndSetOuputShape(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID, + "Get paddings and set output shape failed."); + } else if (paddings_type == DT_INT64) { + KERNEL_CHECK_FALSE((GetPaddingsAndSetOuputShape(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID, + "Get paddings and set output shape failed."); + } else { + KERNEL_LOG_ERROR("PadV3 paddings data type [%s] not support.", DTypeStr(paddings_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + auto data_type_ = ctx.Input(0)->GetDataType(); + switch (data_type_) { + PAD_V3_COMPUTE_CASE(DT_INT8, int8_t, ctx) + PAD_V3_COMPUTE_CASE(DT_INT16, int16_t, ctx) + PAD_V3_COMPUTE_CASE(DT_INT32, int32_t, ctx) + PAD_V3_COMPUTE_CASE(DT_INT64, int64_t, ctx) + PAD_V3_COMPUTE_CASE(DT_UINT8, uint8_t, ctx) + PAD_V3_COMPUTE_CASE(DT_UINT16, uint16_t, ctx) + PAD_V3_COMPUTE_CASE(DT_UINT32, uint32_t, ctx) + PAD_V3_COMPUTE_CASE(DT_UINT64, uint64_t, ctx) + PAD_V3_COMPUTE_CASE(DT_FLOAT16, float16, ctx) + PAD_V3_COMPUTE_CASE(DT_FLOAT, float, ctx) + PAD_V3_COMPUTE_CASE(DT_DOUBLE, double, ctx) + PAD_V3_COMPUTE_CASE(DT_COMPLEX64, std::complex, ctx) + PAD_V3_COMPUTE_CASE(DT_COMPLEX128, std::complex, ctx) + default: + KERNEL_LOG_ERROR("PadV3 kernel data type [%s] not support.", DTypeStr(data_type_).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +int64_t PadV3CpuKernel::EdgeIndexCaculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, + int64_t i_start) { + int64_t ip; + if (now < pad_value) { + ip = pad_value; + } else if (now >= pad_value && now < input_value + pad_value) 
{ + ip = now; + } else { + ip = input_value + pad_value - 1; + } + ip = ip - o_start + i_start; + return ip; +} + +template +uint32_t PadV3CpuKernel::EdgeCompute1D(T *input, T *output, int64_t p) { + int64_t nplane = 0; + int64_t input_w = input_shape[kNum2]; + int64_t output_w = output_shape.end()[-1]; + int64_t pad_l = paddings[0]; + int64_t i_start_x = std::max(int64_t(0), -pad_l); + int64_t o_start_x = std::max(int64_t(0), pad_l); + int64_t ip_x; + for (int64_t j = 0; j < output_w; ++j) { + ip_x = EdgeIndexCaculate(pad_l, j, input_w, o_start_x, i_start_x); + T *dest_p = output + p * output_w * (nplane + 1) + j; + T *src_p = input + +p * input_w * (nplane + 1) + ip_x; + *dest_p = *src_p; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3CpuKernel::EdgeCompute2D(T *input, T *output, int64_t p) { + int64_t pad_l = paddings[0]; + int64_t pad_t = paddings[kNum2]; + int64_t nplane = 0; + int64_t input_h = input_shape[kNum2]; + int64_t input_w = input_shape[kNum3]; + int64_t output_h = input_h + pad_t + paddings[kNum3]; + int64_t output_w = input_w + pad_l + paddings[1]; + int64_t i_start_x = std::max(int64_t(0), -pad_l); + int64_t i_start_y = std::max(int64_t(0), -pad_t); + int64_t o_start_x = std::max(int64_t(0), pad_l); + int64_t o_start_y = std::max(int64_t(0), pad_t); + int64_t ip_x, ip_y; + for (int64_t i = 0; i < output_h; ++i) { + for (int64_t j = 0; j < output_w; ++j) { + ip_x = EdgeIndexCaculate(pad_l, j, input_w, o_start_x, i_start_x); + ip_y = EdgeIndexCaculate(pad_t, i, input_h, o_start_y, i_start_y); + T *dest_p = output + p * output_w * output_h * (nplane + 1) + i * output_w + j; + T *src_p = input + p * input_w * input_h * (nplane + 1) + ip_y * input_w + ip_x; + *dest_p = *src_p; + } + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3CpuKernel::EdgeCompute3D(T *input, T *output, int64_t p) { + int64_t pad_l = paddings[0]; + int64_t pad_t = paddings[kNum2]; + int64_t pad_f = paddings[kNum4]; + int64_t nplane = 0; + int64_t input_d = input_shape[kNum2]; + int64_t input_h = input_shape[kNum3]; + int64_t input_w = input_shape[kNum4]; + int64_t output_d = output_shape[kNum2]; + int64_t output_h = output_shape[kNum3]; + int64_t output_w = output_shape[kNum4]; + int64_t i_start_x = std::max(int64_t(0), -pad_l); + int64_t i_start_y = std::max(int64_t(0), -pad_t); + int64_t i_start_z = std::max(int64_t(0), -pad_f); + int64_t o_start_x = std::max(int64_t(0), pad_l); + int64_t o_start_y = std::max(int64_t(0), pad_t); + int64_t o_start_z = std::max(int64_t(0), pad_f); + int64_t ip_x, ip_y, ip_z; + for (int64_t k = 0; k < output_d; ++k) { + for (int64_t j = 0; j < output_h; ++j) { + for (int64_t i = 0; i < output_w; ++i) { + ip_x = EdgeIndexCaculate(pad_l, i, input_w, o_start_x, i_start_x); + ip_y = EdgeIndexCaculate(pad_t, j, input_h, o_start_y, i_start_y); + ip_z = EdgeIndexCaculate(pad_f, k, input_d, o_start_z, i_start_z); + T *dest_p = + output + p * output_w * output_h * output_d * (nplane + 1) + k * output_w * output_h + j * output_w + i; + T *src_p = + input + p * input_w * input_h * input_d * (nplane + 1) + ip_z * input_w * input_h + ip_y * input_w + ip_x; + *dest_p = *src_p; + } + } + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3CpuKernel::EdgeModeCompute(CpuKernelContext &ctx, int64_t p) { + auto input = reinterpret_cast(ctx.Input(0)->GetData()); + auto output = reinterpret_cast(ctx.Output(0)->GetData()); + if (paddings_num == kPadding1D) { + EdgeCompute1D(input, output, p); + } else if (paddings_num == kPadding2D) { + EdgeCompute2D(input, 
output, p); + } else if (paddings_num == kPadding3D) { + EdgeCompute3D(input, output, p); + } + return KERNEL_STATUS_OK; +} + +int64_t PadV3CpuKernel::ReflectIndexCaculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, + int64_t i_start) { + int64_t ip; + if (now < pad_value) { + ip = pad_value + pad_value - now; + } else if (now >= pad_value && now < input_value + pad_value) { + ip = now; + } else { + ip = (input_value + pad_value - 1) + (input_value + pad_value - 1) - now; + } + ip = ip - o_start + i_start; + return ip; +} + +template +uint32_t PadV3CpuKernel::ReflectCompute1D(T *input, T *output, int64_t p) { + int64_t nplane = 0; + int64_t input_w = input_shape[kNum2]; + int64_t output_w = output_shape.end()[-1]; + int64_t pad_l = paddings[0]; + int64_t i_start_x = std::max(int64_t(0), -pad_l); + int64_t o_start_x = std::max(int64_t(0), pad_l); + int64_t ip_x; + for (int64_t j = 0; j < output_w; ++j) { + ip_x = ReflectIndexCaculate(pad_l, j, input_w, o_start_x, i_start_x); + T *dest_p = output + p * output_w * (nplane + 1) + j; + T *src_p = input + +p * input_w * (nplane + 1) + ip_x; + *dest_p = *src_p; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3CpuKernel::ReflectCompute2D(T *input, T *output, int64_t p) { + int64_t pad_l = paddings[0]; + int64_t pad_t = paddings[kNum2]; + int64_t nplane = 0; + int64_t input_h = input_shape[kNum2]; + int64_t input_w = input_shape[kNum3]; + int64_t output_h = input_h + pad_t + paddings[kNum3]; + int64_t output_w = input_w + pad_l + paddings[1]; + int64_t i_start_x = std::max(int64_t(0), -pad_l); + int64_t i_start_y = std::max(int64_t(0), -pad_t); + int64_t o_start_x = std::max(int64_t(0), pad_l); + int64_t o_start_y = std::max(int64_t(0), pad_t); + int64_t ip_x, ip_y; + for (int64_t i = 0; i < output_h; ++i) { + for (int64_t j = 0; j < output_w; ++j) { + ip_x = ReflectIndexCaculate(pad_l, j, input_w, o_start_x, i_start_x); + ip_y = ReflectIndexCaculate(pad_t, i, input_h, o_start_y, i_start_y); + T *dest_p = output + p * output_w * output_h * (nplane + 1) + i * output_w + j; + T *src_p = input + p * input_w * input_h * (nplane + 1) + ip_y * input_w + ip_x; + *dest_p = *src_p; + } + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3CpuKernel::ReflectCompute3D(T *input, T *output, int64_t p) { + int64_t pad_l = paddings[0]; + int64_t pad_t = paddings[kNum2]; + int64_t pad_f = paddings[kNum4]; + int64_t nplane = 0; + int64_t input_d = input_shape[kNum2]; + int64_t input_h = input_shape[kNum3]; + int64_t input_w = input_shape[kNum4]; + int64_t output_d = output_shape[kNum2]; + int64_t output_h = output_shape[kNum3]; + int64_t output_w = output_shape[kNum4]; + int64_t i_start_x = std::max(int64_t(0), -pad_l); + int64_t i_start_y = std::max(int64_t(0), -pad_t); + int64_t i_start_z = std::max(int64_t(0), -pad_f); + int64_t o_start_x = std::max(int64_t(0), pad_l); + int64_t o_start_y = std::max(int64_t(0), pad_t); + int64_t o_start_z = std::max(int64_t(0), pad_f); + int64_t ip_x, ip_y, ip_z; + for (int64_t k = 0; k < output_d; ++k) { + for (int64_t j = 0; j < output_h; ++j) { + for (int64_t i = 0; i < output_w; ++i) { + ip_x = ReflectIndexCaculate(pad_l, i, input_w, o_start_x, i_start_x); + ip_y = ReflectIndexCaculate(pad_t, j, input_h, o_start_y, i_start_y); + ip_z = ReflectIndexCaculate(pad_f, k, input_d, o_start_z, i_start_z); + T *dest_p = + output + p * output_w * output_h * output_d * (nplane + 1) + k * output_w * output_h + j * output_w + i; + T *src_p = + input + p * input_w * input_h * input_d * (nplane 
+ 1) + ip_z * input_w * input_h + ip_y * input_w + ip_x; + *dest_p = *src_p; + } + } + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3CpuKernel::ReflectModeCompute(CpuKernelContext &ctx, int64_t p) { + auto input = reinterpret_cast(ctx.Input(0)->GetData()); + auto output = reinterpret_cast(ctx.Output(0)->GetData()); + if (paddings_num == kPadding1D) { + ReflectCompute1D(input, output, p); + } else if (paddings_num == kPadding2D) { + ReflectCompute2D(input, output, p); + } else if (paddings_num == kPadding3D) { + ReflectCompute3D(input, output, p); + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3CpuKernel::ConstantModeCompute(CpuKernelContext &ctx, T constant_values) { + auto input_ptr = reinterpret_cast(ctx.Input(0)->GetData()); + auto output_ptr = reinterpret_cast(ctx.Output(0)->GetData()); + int64_t output_num = ctx.Output(0)->NumElements(); + int64_t input_num = 1; + std::vector input_strides(input_dims, 0); + std::vector output_strides(input_dims, 0); + input_strides[input_dims - 1] = 1; + output_strides[input_dims - 1] = 1; + for (int64_t i = input_dims - 1; i >= 1; --i) { + input_strides[i - 1] = input_strides[i] * input_shape[i]; + output_strides[i - 1] = output_strides[i] * output_shape[i]; + } + std::vector offsets(input_dims, 0); + std::vector extents(input_dims, 0); + for (int64_t i = input_dims - 1; i >= 0; --i) { + extents[i] = input_shape[i]; + if (paddings[i * kNum2] < 0) { + extents[i] += paddings[i * kNum2]; + offsets[i] = -paddings[i * kNum2]; + paddings[i * kNum2] = 0; + } + if (paddings[i * kNum2 + 1] < 0) { + extents[i] += paddings[i * kNum2 + 1]; + paddings[i * kNum2 + 1] = 0; + } + input_shape[i] = extents[i]; + input_num *= input_shape[i]; + } + std::vector input_values; + for (int64_t i = 0; i < input_num; ++i) { + int64_t k = i; + int64_t p = 0; + for (int64_t j = input_dims - 1; j >= 0; --j) { + p += (offsets[j] + (k % extents[j])) * input_strides[j]; + k /= extents[j]; + } + input_values.push_back(*(input_ptr + p)); + } + for (int64_t i = 0; i < output_num; ++i) { + *(output_ptr + i) = constant_values; + } + if (input_dims == 1) { + for (int64_t i = 0; i < input_num; ++i) { + *(output_ptr + paddings[0] + i) = input_values[i]; + } + return KERNEL_STATUS_OK; + } + std::vector i_inx_add(input_dims, 0); + std::vector o_inx_add(input_dims, 0); + i_inx_add[input_dims - 1] = output_strides[input_dims - 1] * paddings[kNum2 * (input_dims - 1)]; + o_inx_add[input_dims - 1] = output_strides[input_dims - 1] * paddings[kNum2 * (input_dims - 1) + 1]; + for (int64_t i = input_dims - 1; i >= 1; --i) { + i_inx_add[i - 1] = i_inx_add[i] + output_strides[i - 1] * paddings[kNum2 * (i - 1)]; + o_inx_add[i - 1] = o_inx_add[i] + output_strides[i - 1] * paddings[kNum2 * (i - 1) + 1]; + } + int64_t i_inx = 0; + int64_t o_inx = i_inx_add[0]; + std::vector pos(input_dims - 1, 0); + while (i_inx < input_num) { + for (int64_t i = 0; i < input_shape[input_dims - 1]; ++i) { + *(output_ptr + o_inx + i) = input_values[i_inx + i]; + } + pos[input_dims - kNum2] += 1; + int64_t dep = input_dims - 1; + for (int64_t i = input_dims - kNum2; i >= 0; --i) { + if (i > 0 && pos[i] >= input_shape[i]) { + pos[i] -= input_shape[i]; + pos[i - 1] += 1; + dep = i; + } else { + break; + } + } + o_inx += i_inx_add[dep] + o_inx_add[dep] + input_shape[input_dims - 1]; + i_inx += input_shape[input_dims - 1]; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3CpuKernel::DoCompute(CpuKernelContext &ctx) { + if (mode == "constant") { + T constant_values = static_cast(0); + if 
(ctx.Input(kNum2) != nullptr) { + constant_values = *(reinterpret_cast(ctx.Input(kNum2)->GetData())); + } else { + KERNEL_LOG_DEBUG("Get attr [constant_values] failed, use default value [0]"); + } + for (int64_t i = 0; i < input_dims / kNum2; ++i) { + int64_t u = paddings[i * kNum2]; + int64_t v = paddings[i * kNum2 + 1]; + paddings[i * kNum2] = paddings[kNum2 * (input_dims - i - 1)]; + paddings[i * kNum2 + 1] = paddings[kNum2 * (input_dims - i - 1) + 1]; + paddings[kNum2 * (input_dims - i - 1)] = u; + paddings[kNum2 * (input_dims - i - 1) + 1] = v; + } + ConstantModeCompute(ctx, constant_values); + } else if (mode == "reflect") { + auto shard_padv3_reflcet = [&](int64_t start, int64_t end) { + for (int p = start; p < end; p++) { + ReflectModeCompute(ctx, p); + } + }; + const int64_t data_num = parallelSliceNum; + const bool enable_parallel = data_num > kParallelNum; + if (enable_parallel) { + const int64_t max_core_num = + std::max(static_cast(kMinCoreNum), static_cast(aicpu::CpuKernelUtils::GetCPUNum(ctx))); + const int64_t per_unit_size = data_num / std::min(data_num, max_core_num); + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, per_unit_size, shard_padv3_reflcet), + "PadV3 Compute failed."); + } else { + shard_padv3_reflcet(0, data_num); + } + } else if (mode == "edge") { + auto shard_padv3_edge = [&](int64_t start, int64_t end) { + for (int p = start; p < end; p++) { + EdgeModeCompute(ctx, p); + } + }; + const int64_t data_num = parallelSliceNum; + const bool enable_parallel = data_num > kParallelNum; + if (enable_parallel) { + const int64_t max_core_num = + std::max(static_cast(kMinCoreNum), static_cast(aicpu::CpuKernelUtils::GetCPUNum(ctx))); + const int64_t per_unit_size = data_num / std::min(data_num, max_core_num); + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, per_unit_size, shard_padv3_edge), + "PadV3 Compute failed."); + } else { + shard_padv3_edge(0, data_num); + } + } + return KERNEL_STATUS_OK; +} + +uint32_t PadV3CpuKernel::CheckAndInitParams(CpuKernelContext &ctx) { + if (ctx.GetAttr("mode") == nullptr) { + mode = "constant"; + KERNEL_LOG_DEBUG("Get attr [mode] failed, use default value [constant]"); + } else { + mode = ctx.GetAttr("mode")->GetString(); + const bool is_mode_available = std::find(mode_list.begin(), mode_list.end(), mode) != mode_list.end(); + if (is_mode_available == false) { + KERNEL_LOG_ERROR( + "Attr [mode] must be included in [constant, reflect, edge], but got " + "[%s]", + mode.c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + } + if (ctx.GetAttr("paddings_contiguous") != nullptr) { + paddings_contiguous = ctx.GetAttr("paddings_contiguous")->GetBool(); + } else { + paddings_contiguous = true; + KERNEL_LOG_DEBUG("Get attr [paddings_contiguous] failed, use default value [true]"); + } + if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) { + KERNEL_LOG_ERROR("Tensor y dtype[%s] must be same with x dtype[%s]", DTypeStr(ctx.Output(0)->GetDataType()).c_str(), + DTypeStr(ctx.Input(0)->GetDataType()).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + input_dims = ctx.Input(0)->GetTensorShape()->GetDims(); + const std::vector paddings_shape = ctx.Input(1)->GetTensorShape()->GetDimSizes(); + paddings_num = ctx.Input(1)->NumElements(); + KERNEL_CHECK_FALSE(paddings_shape.size() == 1 && paddings_num == input_dims * kNum2, KERNEL_STATUS_PARAM_INVALID, + "Paddings shape is not supported"); + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3CpuKernel::GetPaddingsAndSetOuputShape(CpuKernelContext &ctx) { + 
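+  // Read the paddings tensor, reversing the order of its (begin, end) pairs;
+  // each pair keeps its internal (begin, end) order.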
+  auto paddings_ptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
+  paddings = std::vector<int64_t>(input_dims * kNum2, 0);
+  for (int64_t i = 0; i < paddings_num; i += kNum2) {
+    paddings[i] = static_cast<int64_t>(paddings_ptr[paddings_num - i - kNum2]);
+    paddings[i + 1] = static_cast<int64_t>(paddings_ptr[paddings_num - i - 1]);
+  }
+  if (mode == "edge" || mode == "reflect" || (mode == "constant" && paddings_contiguous == false)) {
+    paddings_num = paddings_num - kNum4;
+  }
+  if (paddings_contiguous == false) {
+    std::vector<int64_t> tmp = paddings;
+    for (int64_t i = 0; i < paddings_num; ++i) {
+      if (i % kNum2 == 0) {
+        paddings[i] = tmp[i / kNum2];
+      } else {
+        paddings[i] = tmp[(i + paddings_num) / kNum2];
+      }
+    }
+  }
+  input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
+  output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
+  parallelSliceNum = 1;
+  for (int64_t i = 0; i < input_dims - paddings_num / kNum2; ++i) {
+    parallelSliceNum *= input_shape[i];
+  }
+  for (int64_t i = 0; i < paddings_num / kNum2; ++i) {
+    output_shape.end()[-(i + 1)] += (paddings[i * kNum2] + paddings[i * kNum2 + 1]);
+    KERNEL_CHECK_FALSE(output_shape.end()[-(i + 1)] > 0, KERNEL_STATUS_PARAM_INVALID,
+                       "output_shape number must be greater than 0");
+    KERNEL_CHECK_FALSE(input_shape.end()[-(i + 1)] >= std::max(-paddings[i * kNum2], -paddings[i * kNum2 + 1]),
+                       KERNEL_STATUS_PARAM_INVALID,
+                       "Padding size should be less than the corresponding input dimension");
+    if (mode == "reflect") {
+      KERNEL_CHECK_FALSE(input_shape.end()[-(i + 1)] > std::max(paddings[i * kNum2], paddings[i * kNum2 + 1]),
+                         KERNEL_STATUS_PARAM_INVALID,
+                         "Padding size should be less than the corresponding input dimension");
+    }
+  }
+  if (output_shape != ctx.Output(0)->GetTensorShape()->GetDimSizes()) {
+    ctx.Output(0)->GetTensorShape()->SetDimSizes(output_shape);
+    KERNEL_LOG_DEBUG("Set output tensor shape success, num elements:[%llu]",
+                     static_cast<uint64_t>(ctx.Output(0)->NumElements()));
+  } else {
+    KERNEL_LOG_DEBUG("Output tensor is a const tensor, num elements:[%llu]",
+                     static_cast<uint64_t>(ctx.Output(0)->NumElements()));
+  }
+  return KERNEL_STATUS_OK;
+}
+
+REGISTER_CPU_KERNEL(kPadV3, PadV3CpuKernel);
+}  // namespace aicpu
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3.h
new file mode 100644
index 00000000000..a72958c7f08
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3.h
@@ -0,0 +1,89 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef AICPU_KERNELS_NORMALIZED_PAD_V3_H_ +#define AICPU_KERNELS_NORMALIZED_PAD_V3_H_ + +#include +#include +#include + +#include "cpu_ops_kernel.h" +#include "cpu_kernel_utils.h" +#include "kernel_log.h" +#include "securec.h" +#include "status.h" +#include "utils/bcast.h" + +namespace aicpu { +class PadV3CpuKernel : public CpuKernel { + public: + PadV3CpuKernel() = default; + ~PadV3CpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + std::vector paddings; + std::vector input_shape; + std::vector output_shape; + std::string mode; + bool paddings_contiguous; + int64_t input_dims{0}; + int64_t paddings_num{0}; + int64_t parallelSliceNum{1}; + + uint32_t CheckAndInitParams(CpuKernelContext &ctx); + + template + uint32_t GetPaddingsAndSetOuputShape(CpuKernelContext &ctx); + + template + uint32_t DoCompute(CpuKernelContext &ctx); + + template + uint32_t EdgeModeCompute(CpuKernelContext &ctx, int64_t p); + + template + uint32_t EdgeCompute3D(T *input, T *output, int64_t p); + + template + uint32_t EdgeCompute2D(T *input, T *output, int64_t p); + + template + uint32_t EdgeCompute1D(T *input, T *output, int64_t p); + + int64_t EdgeIndexCaculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, int64_t i_start); + + template + uint32_t ReflectModeCompute(CpuKernelContext &ctx, int64_t p); + + template + uint32_t ReflectCompute3D(T *input, T *output, int64_t p); + + template + uint32_t ReflectCompute2D(T *input, T *output, int64_t p); + + template + uint32_t ReflectCompute1D(T *input, T *output, int64_t p); + + int64_t ReflectIndexCaculate(int64_t pad_value, int64_t now, int64_t input_value, int64_t o_start, int64_t i_start); + + template + uint32_t ConstantModeCompute(CpuKernelContext &ctx, T constant_values); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3_grad.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3_grad.cc new file mode 100644 index 00000000000..33df2876f47 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3_grad.cc @@ -0,0 +1,367 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "pad_v3_grad.h" + +#include +#include +#include +#include + +#include "securec.h" +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const char *kPadV3Grad = "PadV3Grad"; +constexpr uint32_t kInputNum = 2; +constexpr uint32_t kOutputNum = 1; +constexpr int64_t kParallelNum = 1024 * 64; +const int64_t k3DNum = 6; +const int64_t k2DNum = 4; +const int64_t k1DNum = 2; +constexpr int64_t kpad_l = 0; +constexpr int64_t kpad_t = 2; +constexpr int64_t kpad_f = 4; +constexpr int64_t kwidth = 1; +constexpr int64_t kheight = 2; +constexpr int64_t kchannel = 3; +constexpr int64_t kInput1Dim = 3; +constexpr int64_t kInput2Dim = 4; +constexpr int64_t kInput3Dim = 5; +constexpr int64_t k2Num = 2; +constexpr int64_t k3Num = 3; +constexpr int64_t k4Num = 4; + +const std::vector mode_list = {"reflect", "edge"}; +using float16 = Eigen::half; + +#define PAD_V3_GRAD_READ_PADDINGS(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result1 = PadV3ReadPaddingsAndSetOutputShape1(CTX); \ + uint32_t result2 = PadV3ReadPaddingsAndSetOutputShape2(CTX); \ + if (result1 != KERNEL_STATUS_OK || result2 != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("PadV3Grad kernel compute failed."); \ + return result1 && result2; \ + } \ + break; \ + } + +#define PAD_V3_GRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = PadV3GradCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("PadV3Grad kernel compute failed."); \ + return result; \ + } \ + break; \ + } +} // namespace + +namespace aicpu { +uint32_t PadV3GradCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(PadV3GradCheck(ctx), "PadV3Grad check params failed."); + auto paddings_type = ctx.Input(1)->GetDataType(); + switch (paddings_type) { + PAD_V3_GRAD_READ_PADDINGS(DT_INT32, int32_t, ctx) + PAD_V3_GRAD_READ_PADDINGS(DT_INT64, int64_t, ctx) + default: + KERNEL_LOG_ERROR("PadV3Grad paddings data type [%s] not support.", DTypeStr(paddings_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + auto data_type = ctx.Output(0)->GetDataType(); + switch (data_type) { + PAD_V3_GRAD_COMPUTE_CASE(DT_INT8, int8_t, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_INT16, int16_t, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_INT64, int64_t, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_UINT8, uint8_t, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_UINT16, uint16_t, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_UINT32, uint32_t, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_UINT64, uint64_t, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_FLOAT16, float16, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_FLOAT, float, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_COMPLEX64, std::complex, ctx) + PAD_V3_GRAD_COMPUTE_CASE(DT_COMPLEX128, std::complex, ctx) + default: + KERNEL_LOG_ERROR("PadV3Grad kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +uint32_t PadV3GradCpuKernel::PadV3GradCheck(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "PadV3Grad check failed."); + if (ctx.GetAttr("paddings_contiguous") == nullptr) { + padding_contiguous = true; + KERNEL_LOG_DEBUG("Get attr [paddings_contiguous] failed, use default value [true]"); + } else { + padding_contiguous = ctx.GetAttr("paddings_contiguous")->GetBool(); + } + if (ctx.GetAttr("mode") == nullptr) { + mode = "reflect"; + KERNEL_LOG_DEBUG("Get attr [mode] failed, use default value 
[reflect]"); + } else { + mode = ctx.GetAttr("mode")->GetString(); + const bool is_mode_available = std::find(mode_list.begin(), mode_list.end(), mode) != mode_list.end(); + if (is_mode_available == false) { + KERNEL_LOG_ERROR("Attr [mode] must be included in [reflect, edge], but got [%s]", mode.c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + } + + if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) { + KERNEL_LOG_ERROR("Tensor y dtype[%s] must be same with x dtype[%s]", DTypeStr(ctx.Output(0)->GetDataType()).c_str(), + DTypeStr(ctx.Input(0)->GetDataType()).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + const std::vector paddings_shape = ctx.Input(1)->GetTensorShape()->GetDimSizes(); + KERNEL_CHECK_FALSE( + paddings_shape.size() == 1 && (paddings_shape[0] == k3DNum + k4Num || paddings_shape[0] == k2DNum + k4Num || + paddings_shape[0] == k1DNum + k4Num || paddings_shape[0] == 1), + KERNEL_STATUS_PARAM_INVALID, "Paddings shape is not supported"); + KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() >= kInput1Dim, KERNEL_STATUS_PARAM_INVALID, + "Dims of tensor x should be greater than or equal to 3"); + KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() <= kInput3Dim, KERNEL_STATUS_PARAM_INVALID, + "Only 3D, 4D, 5D padding with non-constant padding are " + "supported for now"); + + const int64_t input_dim = ctx.Input(0)->GetTensorShape()->GetDims(); + const int64_t num_elem = ctx.Input(1)->NumElements(); + KERNEL_CHECK_FALSE(num_elem % k2Num == 0 || num_elem == 1, KERNEL_STATUS_PARAM_INVALID, + "Padding length must be divisible by 2"); + + if (input_dim == kInput1Dim) { + KERNEL_CHECK_FALSE(num_elem == k1DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID, + "3D tensors expect 6 values for padding"); + } else if (input_dim == kInput2Dim) { + KERNEL_CHECK_FALSE(num_elem == k2DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID, + "4D tensors expect 8 values for padding"); + } else if (input_dim == kInput3Dim) { + KERNEL_CHECK_FALSE(num_elem == k3DNum + k4Num || num_elem == 1, KERNEL_STATUS_PARAM_INVALID, + "5D tensors expect 10 values for padding"); + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3GradCpuKernel::PadV3ReadPaddingsAndSetOutputShape1(CpuKernelContext &ctx) { + num_elem = ctx.Input(1)->NumElements(); + input_dim = ctx.Input(0)->GetTensorShape()->GetDims(); + const std::vector input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes(); + auto paddings_ptr = reinterpret_cast(ctx.Input(1)->GetData()); + paddings = std::vector(input_dim * k2Num, 0); + + for (int64_t i = 0; i < num_elem; i += k2Num) { + paddings[i] = static_cast(paddings_ptr[num_elem - i - k2Num]); + paddings[i + 1] = static_cast(paddings_ptr[num_elem - i - 1]); + } + num_elem = num_elem - k4Num; + if (num_elem == 1) { + num_elem = k2Num * (input_dim - k2Num); + for (int64_t i = 0; i < k2Num * (input_dim - k2Num); ++i) { + paddings[i] = static_cast(paddings_ptr[0]); + } + } + + parallelSliceNum = 1; + for (int64_t i = 0; i < input_dim - num_elem / k2Num; i++) { + parallelSliceNum *= input_shape[i]; + } + + if (padding_contiguous == false && num_elem == k3DNum) { + std::vector tmp = paddings; + paddings[1] = tmp[k3Num]; + paddings[k2Num] = tmp[1]; + paddings[k3Num] = tmp[k4Num]; + paddings[k4Num] = tmp[k2Num]; + } + + if (padding_contiguous == false && num_elem == k2DNum) { + std::vector tmp = paddings; + paddings[1] = tmp[k2Num]; + paddings[k2Num] = tmp[1]; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t 
PadV3GradCpuKernel::PadV3ReadPaddingsAndSetOutputShape2(CpuKernelContext &ctx) { + std::vector output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes(); + output_shape.end()[-kwidth] -= (paddings[kpad_l] + paddings[kpad_l + 1]); + output_shape.end()[-kheight] -= (paddings[kpad_t] + paddings[kpad_t + 1]); + output_shape.end()[-kchannel] -= (paddings[kpad_f] + paddings[kpad_f + 1]); + + KERNEL_CHECK_FALSE( + output_shape.end()[-kwidth] > 0 && output_shape.end()[-kheight] > 0 && output_shape.end()[-kchannel] > 0, + KERNEL_STATUS_PARAM_INVALID, "output_shape number must be greater than 0"); + + if (output_shape != ctx.Output(0)->GetTensorShape()->GetDimSizes()) { + ctx.Output(0)->GetTensorShape()->SetDimSizes(output_shape); + KERNEL_LOG_DEBUG("Set output tensor shape success, num elements:[%llu]", + static_cast(ctx.Output(0)->NumElements())); + } else { + KERNEL_LOG_DEBUG("Output tensor is a const tensor, num elements:[%llu]", + static_cast(ctx.Output(0)->NumElements())); + } + const std::string padding_contiguous_str = padding_contiguous ? std::string("True") : std::string("False"); + KERNEL_LOG_DEBUG( + "PadV3GradCpuKernel[%s], x: size[%llu] dtype[%s], " + "paddings: size[%llu] dtype[%s], y: size[%llu] dtype[%s], mode: [%s], " + "padding_contiguous: [%s].", + ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), DTypeStr(ctx.Input(0)->GetDataType()).c_str(), + ctx.Input(1)->GetDataSize(), DTypeStr(ctx.Input(1)->GetDataType()).c_str(), ctx.Output(0)->GetDataSize(), + DTypeStr(ctx.Output(0)->GetDataType()).c_str(), mode.c_str(), padding_contiguous_str.c_str()); + return KERNEL_STATUS_OK; +} + +int64_t PadV3GradCpuKernel::IndexCaculate(int64_t pad_value, int64_t now, int64_t output_value, int64_t o_start, + int64_t i_start) { + int64_t ip = 0; + if (now < pad_value) { + if (mode == "reflect") { + ip = pad_value + pad_value - now; + } else if (mode == "edge") { + ip = pad_value; + } + } else if (now >= pad_value && now < output_value + pad_value) { + ip = now; + } else { + if (mode == "reflect") { + ip = (output_value + pad_value - 1) + (output_value + pad_value - 1) - now; + } else if (mode == "edge") { + ip = output_value + pad_value - 1; + } + } + ip = ip - o_start + i_start; + return ip; +} + +template +uint32_t PadV3GradCpuKernel::PadV3GradCompute1(T *input, T *output, int64_t p) { + if (num_elem == k1DNum) { + PadV3GradCompute1D(input, output, p); + } else if (num_elem == k2DNum) { + for (int i = 0; i < input_h; i++) { + PadV3GradCompute2D(input, output, p, i); + } + } else if (num_elem == k3DNum) { + for (int z = 0; z < input_c; z++) { + PadV3GradCompute3D(input, output, p, z); + } + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3GradCpuKernel::PadV3GradCompute1D(T *input, T *output, int64_t p) { + int ip_x; + for (int j = 0; j < input_w; j++) { + ip_x = IndexCaculate(pad_l, j, output_w, o_start_x, i_start_x); + T *src_p = input + p * input_w + j; + T *dest_p = output + p * output_w + ip_x; + *dest_p += *src_p; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3GradCpuKernel::PadV3GradCompute2D(T *input, T *output, int64_t p, int64_t i) { + int ip_x, ip_y; + for (int j = 0; j < input_w; j++) { + ip_x = IndexCaculate(pad_l, j, output_w, o_start_x, i_start_x); + ip_y = IndexCaculate(pad_t, i, output_h, o_start_y, i_start_y); + T *src_p = input + p * input_w * input_h + i * input_w + j; + T *dest_p = output + p * output_w * output_h + ip_y * output_w + ip_x; + *dest_p += *src_p; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t 
PadV3GradCpuKernel::PadV3GradCompute3D(T *input, T *output, int64_t p, int64_t z) { + int ip_x, ip_y, ip_z; + for (int i = 0; i < input_h; i++) { + for (int j = 0; j < input_w; j++) { + ip_x = IndexCaculate(pad_l, j, output_w, o_start_x, i_start_x); + ip_y = IndexCaculate(pad_t, i, output_h, o_start_y, i_start_y); + ip_z = IndexCaculate(pad_f, z, output_c, o_start_z, i_start_z); + T *src_p = input + p * input_w * input_h * input_c + z * input_w * input_h + i * input_w + j; + T *dest_p = output + p * output_w * output_h * output_c + ip_z * output_w * output_h + ip_y * output_w + ip_x; + *dest_p += *src_p; + } + } + return KERNEL_STATUS_OK; +} + +template +uint32_t PadV3GradCpuKernel::PadV3GradCompute(CpuKernelContext &ctx) { + const std::vector input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes(); + std::vector output_shape = ctx.Output(0)->GetTensorShape()->GetDimSizes(); + + T *input = reinterpret_cast(ctx.Input(0)->GetData()); + T *output = reinterpret_cast(ctx.Output(0)->GetData()); + + output_w = output_shape.end()[-kwidth]; + output_h = output_shape.end()[-kheight]; + output_c = output_shape.end()[-kchannel]; + input_w = input_shape.end()[-kwidth]; + input_h = input_shape.end()[-kheight]; + input_c = input_shape.end()[-kchannel]; + i_start_x = std::max(int64_t(0), -paddings[kpad_l]); + i_start_y = std::max(int64_t(0), -paddings[kpad_t]); + i_start_z = std::max(int64_t(0), -paddings[kpad_f]); + o_start_x = std::max(int64_t(0), paddings[kpad_l]); + o_start_y = std::max(int64_t(0), paddings[kpad_t]); + o_start_z = std::max(int64_t(0), paddings[kpad_f]); + pad_l = paddings[kpad_l]; + pad_t = paddings[kpad_t]; + pad_f = paddings[kpad_f]; + + int64_t output_num_ = 1; + for (int64_t i = 0; i < input_dim; i++) { + output_num_ *= output_shape[i]; + } + auto ret = memset_s(output, sizeof(T) * output_num_, 0, sizeof(T) * output_num_); + if (ret != EOK) { + KERNEL_LOG_ERROR("memset_s error, ret=%d", ret); + return KERNEL_STATUS_INNER_ERROR; + } + auto shard_padv3_grad = [&](int64_t start, int64_t end) { + for (int p = start; p < end; p++) { + PadV3GradCompute1(input, output, p); + } + }; + const int64_t data_num = parallelSliceNum; + const bool enable_parallel = parallelSliceNum > kParallelNum; + if (enable_parallel) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_padv3_grad), + "PadV3Grad Compute failed."); + } else { + for (int p = 0; p < data_num; p++) { + PadV3GradCompute1(input, output, p); + } + } + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kPadV3Grad, PadV3GradCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3_grad.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3_grad.h new file mode 100644 index 00000000000..cd78c06a307 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/pad_v3_grad.h @@ -0,0 +1,81 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_
+#define AICPU_KERNELS_NORMALIZED_PAD_V3_GRAD_H_
+
+#include <string>
+#include <vector>
+
+#include "cpu_ops_kernel.h"
+#include "utils/bcast.h"
+
+namespace aicpu {
+class PadV3GradCpuKernel : public CpuKernel {
+ public:
+  PadV3GradCpuKernel() = default;
+  ~PadV3GradCpuKernel() override = default;
+
+ protected:
+  uint32_t Compute(CpuKernelContext &ctx) override;
+
+ private:
+  bool padding_contiguous = true;
+  std::string mode = "reflect";
+  std::vector<int64_t> paddings;
+  int64_t output_w;
+  int64_t output_h;
+  int64_t output_c;
+  int64_t input_w;
+  int64_t input_h;
+  int64_t input_c;
+  int64_t i_start_x;
+  int64_t i_start_y;
+  int64_t i_start_z;
+  int64_t o_start_x;
+  int64_t o_start_y;
+  int64_t o_start_z;
+  int64_t pad_l;
+  int64_t pad_t;
+  int64_t pad_f;
+  int64_t parallelSliceNum;
+  int64_t num_elem;
+  int64_t input_dim;
+  uint32_t PadV3GradCheck(CpuKernelContext &ctx);
+
+  template <typename T>
+  uint32_t PadV3GradCompute(CpuKernelContext &ctx);
+
+  template <typename T>
+  uint32_t PadV3GradCompute1D(T *input, T *output, int64_t p);
+
+  template <typename T>
+  uint32_t PadV3GradCompute2D(T *input, T *output, int64_t p, int64_t i);
+
+  template <typename T>
+  uint32_t PadV3GradCompute3D(T *input, T *output, int64_t p, int64_t z);
+
+  template <typename T>
+  uint32_t PadV3GradCompute1(T *input, T *output, int64_t p);
+
+  int64_t IndexCaculate(int64_t pad_value, int64_t now, int64_t output_value, int64_t o_start, int64_t i_start);
+
+  template <typename T>
+  uint32_t PadV3ReadPaddingsAndSetOutputShape1(CpuKernelContext &ctx);
+
+  template <typename T>
+  uint32_t PadV3ReadPaddingsAndSetOutputShape2(CpuKernelContext &ctx);
+};
+}  // namespace aicpu
+#endif
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc
index 7dff4556716..9f168721d38 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc
@@ -75,8 +75,6 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
                                           mindspore::kFillOpName,
                                           mindspore::kLogMatrixDeterminantOpName,
                                           mindspore::kMatrixSolveLsOpName,
-                                          mindspore::kMaskedSelectOpName,
-                                          mindspore::kMaskedSelectGradOpName,
                                           mindspore::kMedianOpName,
                                           mindspore::kMedianGradOpName,
                                           mindspore::kNMSWithMaskOpName,
@@ -178,7 +176,11 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
                                           mindspore::kLuUnpackOpName,
                                           mindspore::kLuUnpackGradOpName,
                                           mindspore::kMatMulOpName,
-                                          mindspore::kMatrixExpOpName};
+                                          mindspore::kMatrixExpOpName,
+                                          mindspore::kPadV3GradOpName,
+                                          mindspore::kPadV3OpName,
+                                          mindspore::kLogicalXorOpName,
+                                          mindspore::kLogNormalReverseOpName};
 
   static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
   static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
diff --git a/mindspore/python/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/python/mindspore/ops/_op_impl/aicpu/__init__.py
index befabc968a5..6a21cadee6c 100644
--- a/mindspore/python/mindspore/ops/_op_impl/aicpu/__init__.py
+++ b/mindspore/python/mindspore/ops/_op_impl/aicpu/__init__.py
@@ -277,3 +277,5 @@ from .lstsq import _lstsq_aicpu
 from .lu_unpack import _lu_unpack_aicpu
 from .lu_unpack_grad import _lu_unpack_grad_aicpu
 from .matrix_exp import _matrix_exp_aicpu
+from .pad_v3_grad import _pad_v3_grad_aicpu
+from .pad_v3 import _pad_v3_aicpu
diff --git a/mindspore/python/mindspore/ops/composite/math_ops.py b/mindspore/python/mindspore/ops/composite/math_ops.py
index bbd83149ba4..40d1829c64d 100644
--- a/mindspore/python/mindspore/ops/composite/math_ops.py
+++ b/mindspore/python/mindspore/ops/composite/math_ops.py
@@ -765,8 +765,12 @@ def resize_nearest_neighbor(input_x, size, align_corners=False):
         ``Ascend`` ``GPU`` ``CPU``
 
     Examples:
+        >>> import numpy as np
+        >>> import mindspore
+        >>> from mindspore import Tensor, ops
         >>> input_tensor = Tensor(np.array([[[[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]]]), mindspore.float32)
-        >>> output = ops.ResizeNearestNeighbor(input_tensor, (2, 2))
+        >>> size = (2, 2)
+        >>> output = ops.ResizeNearestNeighbor(size=size)(input_tensor)
         >>> print(output)
         [[[[-0.1  0.3]
            [ 0.4  0.5]]]]
diff --git a/mindspore/python/mindspore/ops/operations/array_ops.py b/mindspore/python/mindspore/ops/operations/array_ops.py
index b8ec73547c9..d4514fcacf1 100755
--- a/mindspore/python/mindspore/ops/operations/array_ops.py
+++ b/mindspore/python/mindspore/ops/operations/array_ops.py
@@ -3744,9 +3744,12 @@ class ResizeNearestNeighbor(Primitive):
         ``Ascend`` ``GPU`` ``CPU``
 
     Examples:
+        >>> import numpy as np
+        >>> import mindspore
+        >>> from mindspore import Tensor, ops
         >>> input_tensor = Tensor(np.array([[[[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]]]), mindspore.float32)
-        >>> resize = ops.ResizeNearestNeighbor((2, 2))
-        >>> output = resize(input_tensor)
+        >>> size = (2, 2)
+        >>> output = ops.ResizeNearestNeighbor(size=size)(input_tensor)
         >>> print(output)
         [[[[-0.1  0.3]
            [ 0.4  0.5]]]]
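
The three modes implemented by the PadV3 kernel in this patch ("constant", "reflect", "edge") follow the same conventions as the same-named modes of NumPy's np.pad: "reflect" mirrors around the border without repeating the edge element, and "edge" replicates it. As a quick reference for the index arithmetic in ReflectIndexCaculate and EdgeIndexCaculate, here is a minimal NumPy sketch (illustrative only, not part of the patch) that pads the last dimension of a 3D input, the paddings_num == kPadding1D case:

import numpy as np

# A (N, C, W) input, the smallest rank the kernel accepts for reflect/edge modes.
x = np.arange(6, dtype=np.float32).reshape(1, 1, 6)
pad = ((0, 0), (0, 0), (2, 1))  # pad only the last dimension: 2 on the left, 1 on the right

const = np.pad(x, pad, mode="constant", constant_values=0)  # new cells filled with constant_values
refl = np.pad(x, pad, mode="reflect")                       # mirrored, border element not repeated
edge = np.pad(x, pad, mode="edge")                          # border element replicated

print(const[0, 0])  # [0. 0. 0. 1. 2. 3. 4. 5. 0.]
print(refl[0, 0])   # [2. 1. 0. 1. 2. 3. 4. 5. 4.]
print(edge[0, 0])   # [0. 0. 0. 1. 2. 3. 4. 5. 5.]

For reflect mode this also illustrates why GetPaddingsAndSetOuputShape requires each padding to be strictly smaller than the corresponding input dimension: mirroring can reach at most input_value - 1 elements past the border.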