diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/hypot.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/hypot.cc new file mode 100644 index 00000000000..11979e058f9 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/hypot.cc @@ -0,0 +1,228 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "hypot.h" + +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t kOutputNum = 1; +const uint32_t kInputNum = 2; +const char *kHypot = "Hypot"; +const int64_t kParallelDataNum = 2 * 1024; +const int64_t kParallelDataNumMid = 16 * 1024; +const int64_t kParallelDataNumSameShape = 7 * 1024; +const int64_t kParallelDataNumSameShapeMid = 35 * 1024; + +#define HYPOT_COMPUTE_CASE(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = HypotCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("Hypot kernel compute failed."); \ + return result; \ + } \ + break; \ + } +} // namespace + +namespace aicpu { +template +T hypot(T a, T b) { + return std::hypot(a, b); +} + +uint32_t HypotCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Hypot check input and output number failed."); + KERNEL_HANDLE_ERROR(HypotParamCheck(ctx), "Hypot check params failed."); + auto data_type = ctx.Input(0)->GetDataType(); + switch (data_type) { + HYPOT_COMPUTE_CASE(DT_FLOAT, float_t, ctx) + HYPOT_COMPUTE_CASE(DT_DOUBLE, double_t, ctx) + default: + KERNEL_LOG_ERROR("Hypot kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + return KERNEL_STATUS_OK; +} + +uint32_t HypotCpuKernel::HypotParamCheck(CpuKernelContext &ctx) { + Tensor *input_0 = ctx.Input(0); + Tensor *input_1 = ctx.Input(1); + Tensor *output = ctx.Output(0); + DataType input0_type = input_0->GetDataType(); + DataType input1_type = input_1->GetDataType(); + KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID, + "The data type of input0 [%s] need be same with " + "input1 [%s].", + DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str()) + KERNEL_LOG_DEBUG( + "HypotCpuKernel[%s], input0: size[%llu];" + "input1: size[%llu], output: size[%llu].", + ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize()); + + return KERNEL_STATUS_OK; +} + +template +uint32_t HypotCpuKernel::NoBcastCompute(CpuKernelContext &ctx) { + auto in0 = reinterpret_cast(ctx.Input(0)->GetData()); + auto in1 = reinterpret_cast(ctx.Input(1)->GetData()); + auto out = reinterpret_cast(ctx.Output(0)->GetData()); + int64_t in0_elements_nums = ctx.Input(0)->NumElements(); + int64_t in1_elements_nums = ctx.Input(1)->NumElements(); + int64_t data_num = ctx.Output(0)->NumElements(); + BcastShapeType type; + + if 
(in0_elements_nums == in1_elements_nums) { + type = BcastShapeType::SAME_SHAPE; + } else { + type = (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT); + } + + if (data_num >= kParallelDataNumSameShape) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + + if (data_num <= kParallelDataNumSameShapeMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + + if (max_core_num > data_num) { + max_core_num = data_num; + } + + auto sharder_hypot = [&](int64_t start, int64_t end) { + switch (type) { + case BcastShapeType::SAME_SHAPE: + for (int64_t i = start; i < end; ++i) { + *(out + i) = hypot(*(in0 + i), *(in1 + i)); + } + break; + case BcastShapeType::X_ONE_ELEMENT: + for (int64_t i = start; i < end; ++i) { + *(out + i) = hypot(*in0, *(in1 + i)); + } + break; + case BcastShapeType::Y_ONE_ELEMENT: + for (int64_t i = start; i < end; ++i) { + *(out + i) = hypot(*(in0 + i), *in1); + } + break; + default: + KERNEL_LOG_ERROR("Invalid type [%d]", static_cast(type)); + break; + } + }; + if (max_core_num == 0) { + KERNEL_LOG_ERROR("max_core_num could not be 0"); + return KERNEL_STATUS_PARAM_INVALID; + } + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot), + "Hypot Compute failed."); + } else { + switch (type) { + case BcastShapeType::SAME_SHAPE: + for (int64_t i = static_cast(0); i < data_num; ++i) { + *(out + i) = hypot(*(in0 + i), *(in1 + i)); + } + break; + case BcastShapeType::X_ONE_ELEMENT: + for (int64_t i = static_cast(0); i < data_num; ++i) { + *(out + i) = hypot(*in0, *(in1 + i)); + } + break; + case BcastShapeType::Y_ONE_ELEMENT: + for (int64_t i = static_cast(0); i < data_num; ++i) { + *(out + i) = hypot(*(in0 + i), *in1); + } + break; + default: + KERNEL_LOG_ERROR("Invalid type [%d]", static_cast(type)); + break; + } + } + + return KERNEL_STATUS_OK; +} + +template +uint32_t HypotCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) { + T *in0 = reinterpret_cast(ctx.Input(0)->GetData()); + T *in1 = reinterpret_cast(ctx.Input(1)->GetData()); + T *out = reinterpret_cast(ctx.Output(0)->GetData()); + int64_t data_num = ctx.Output(0)->NumElements(); + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + + if (data_num <= kParallelDataNumMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + + if (max_core_num > data_num) { + max_core_num = data_num; + } + + auto sharder_hypot = [&](int64_t start, int64_t end) { + for (int64_t i = start; i < end; ++i) { + *(out + i) = hypot(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i))); + } + }; + + if (max_core_num == 0) { + KERNEL_LOG_ERROR("max_core_num could not be 0"); + return KERNEL_STATUS_PARAM_INVALID; + } + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_hypot), + "Hypot Compute failed."); + } else { + for (int64_t i = 0; i < data_num; ++i) { + *(out + i) = hypot(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i))); + } + } + return KERNEL_STATUS_OK; +} + +template +uint32_t HypotCpuKernel::HypotCompute(CpuKernelContext &ctx) { + Tensor *input0_tensor = ctx.Input(0); + auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes(); + int64_t input0_elements_nums = input0_tensor->NumElements(); + + Tensor 
*input1_tensor = ctx.Input(1); + auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes(); + int64_t input1_elements_nums = input1_tensor->NumElements(); + + bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1); + if (isNeedBcast) { + return NoBcastCompute(ctx); + } else { + Bcast bcast(input0_shape, input1_shape); + if (!bcast.IsValid()) { + KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + return BcastCompute(ctx, bcast); + } + + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kHypot, HypotCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/hypot.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/hypot.h new file mode 100644 index 00000000000..298dc3ad567 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/hypot.h @@ -0,0 +1,43 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AICPU_KERNELS_NORMALIZED_HYPOT_H_ +#define AICPU_KERNELS_NORMALIZED_HYPOT_H_ + +#include "cpu_ops_kernel.h" +#include "utils/bcast.h" +namespace aicpu { +class HypotCpuKernel : public CpuKernel { + public: + HypotCpuKernel() = default; + ~HypotCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + uint32_t HypotParamCheck(CpuKernelContext &ctx); + + template + uint32_t NoBcastCompute(CpuKernelContext &ctx); + + template + uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast); + + template + uint32_t HypotCompute(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/identityn.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/identityn.cc new file mode 100644 index 00000000000..9e5865b4e9e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/identityn.cc @@ -0,0 +1,81 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "identityn.h"
+#include <algorithm>
+#include <vector>
+#include "cpu_types.h"
+#include "kernel_log.h"
+#include "securec.h"
+#include "status.h"
+#include "utils/kernel_util.h"
+
+namespace {
+const char *kIdentityN = "IdentityN";
+}  // namespace
+
+namespace aicpu {
+uint32_t IdentityNCpuKernel::IdentityNParamCheck(CpuKernelContext &ctx) {
+  // input size and output size check
+  uint32_t input_size = ctx.GetInputsSize();
+  uint32_t output_size = ctx.GetOutputsSize();
+  KERNEL_CHECK_FALSE((input_size == output_size), KERNEL_STATUS_PARAM_INVALID,
+                     "Input size should equal to Output size.");
+  KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_size, output_size), "[%s] check params failed.", kIdentityN);
+  for (uint32_t idx = 0; idx < input_size; ++idx) {
+    Tensor *in_tensor = ctx.Input(idx);
+    Tensor *out_tensor = ctx.Output(idx);
+    // TensorShape check
+    auto in_shape = in_tensor->GetTensorShape();
+    auto out_shape = out_tensor->GetTensorShape();
+    KERNEL_CHECK_FALSE((in_shape->GetDimSizes() == out_shape->GetDimSizes()), KERNEL_STATUS_PARAM_INVALID,
+                       "In tensor shape should equal to out tensor shape.");
+    // DataType Check
+    DataType in_type = in_tensor->GetDataType();
+    DataType out_type = out_tensor->GetDataType();
+    KERNEL_CHECK_FALSE((in_type == out_type), KERNEL_STATUS_PARAM_INVALID,
+                       "In tensor data type should equal to out tensor data type.");
+    bool type_support =
+      std::find(support_data_type.begin(), support_data_type.end(), in_type) != support_data_type.end();
+    KERNEL_CHECK_FALSE(type_support, KERNEL_STATUS_PARAM_INVALID, "IdentityN kernel data type [%s] not support.",
+                       DTypeStr(in_type).c_str());
+  }
+  return KERNEL_STATUS_OK;
+}
+
+uint32_t IdentityNCpuKernel::Compute(CpuKernelContext &ctx) {
+  KERNEL_HANDLE_ERROR(IdentityNParamCheck(ctx), "IdentityNCpuKernel check params failed");
+  uint32_t input_size = ctx.GetInputsSize();
+  for (uint32_t idx = 0; idx < input_size; ++idx) {
+    Tensor *in_tensor = ctx.Input(idx);
+    Tensor *out_tensor = ctx.Output(idx);
+    auto in_data = in_tensor->GetData();
+    auto out_data = out_tensor->GetData();
+    uint64_t in_size = in_tensor->GetDataSize();
+    uint64_t out_size = out_tensor->GetDataSize();
+
+    // memory copy
+    if (out_data != in_data) {
+      int cpret = memcpy_s(out_data, out_size, in_data, in_size);
+      KERNEL_CHECK_FALSE((cpret == EOK), KERNEL_STATUS_INNER_ERROR,
+                         "[%s] memcpy_s to output failed, destMax [%ld], count [%ld].", kIdentityN, out_size, in_size);
+    }
+  }
+  return KERNEL_STATUS_OK;
+}
+
+REGISTER_CPU_KERNEL(kIdentityN, IdentityNCpuKernel);
+}  // namespace aicpu
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/identityn.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/identityn.h
new file mode 100644
index 00000000000..356f073b292
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/identityn.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_ +#define AICPU_KERNELS_NORMALIZED_IDENTITY_N_H_ + +#include "cpu_ops_kernel.h" + +namespace aicpu { +class IdentityNCpuKernel : public CpuKernel { + public: + IdentityNCpuKernel() = default; + ~IdentityNCpuKernel() = default; + + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + uint32_t IdentityNParamCheck(CpuKernelContext &ctx); + const std::vector support_data_type = {DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, + DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}; +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/index_fill.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/index_fill.cc new file mode 100644 index 00000000000..b897fcdd0b3 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/index_fill.cc @@ -0,0 +1,230 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "index_fill.h" + +#include + +#include + +#include "Eigen/Core" +#include "cpu_kernel_utils.h" +#include "cpu_types.h" +#include "kernel_log.h" +#include "status.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t kNumInput = 4; +const uint32_t kNumOutput = 1; +const char *kIndexFill = "IndexFill"; + +// when input data size is more than kParallelDataNum, use Parallel func +const uint32_t kParallelDataNum = 16 * 1024; +const uint32_t kParallelDataNumMid = 128 * 1024; + +#define INDEXFILL_COMPUTE_CASE(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = DoCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("IndexFill kernel compute failed."); \ + return result; \ + } \ + break; \ + } +} // namespace + +namespace aicpu { +uint32_t IndexFillCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) { + // check params + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNumInput, kNumOutput), "IndexFill check input and output number failed."); + // get input Tensors + for (uint32_t i = 0; i < kNumInput; ++i) { + Tensor *tensor = ctx.Input(i); + inputs_.push_back(tensor); + } + // get output Tensors + Tensor *tensor = ctx.Output(0); + outputs_.push_back(tensor); + + int32_t value_dim = inputs_[3]->GetTensorShape()->GetDims(); + + KERNEL_CHECK_FALSE((value_dim == 0), KERNEL_STATUS_INNER_ERROR, + "IndexFill only supports a 0-dimensional value tensor, " + "but got tensor with [%d] dimension(s).", + value_dim) + + DataType dim_type = inputs_[1]->GetDataType(); + DataType index_type = inputs_[2]->GetDataType(); + + if (dim_type != DT_INT32) { + KERNEL_LOG_ERROR("IndexFill: Expected dtype int32 for dim."); + return KERNEL_STATUS_PARAM_INVALID; + } + if (index_type != DT_INT32) { + KERNEL_LOG_ERROR("IndexFill: Expected dtype int32 for index."); + 
return KERNEL_STATUS_PARAM_INVALID; + } + + return KERNEL_STATUS_OK; +} + +template +void IndexFillCpuKernel::SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim, + std::map &index_dict) { + auto *input_x = reinterpret_cast(inputs_[0]->GetData()); + auto *input_value = reinterpret_cast(inputs_[3]->GetData()); + auto *output_y = reinterpret_cast(outputs_[0]->GetData()); + int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims(); + auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes(); + + int32_t dim_flag; + if (x_dim_nums != 0) { + dim_flag = *input_dim % x_dim_nums + 1; + } else { + dim_flag = 0; + } + + int32_t remain_dims = 1; + if (dim_flag == x_dim_nums) { + if (dim_flag != 0) { + remain_dims = x_dims[*input_dim]; + } + for (int64_t i = start; i < end; i++) { + int32_t index_flag = i % remain_dims; + std::map::iterator f = index_dict.find(index_flag); + if (f != index_dict.end()) { + output_y[i] = *input_value; + } else { + output_y[i] = input_x[i]; + } + } + } else { + for (int32_t i = *input_dim + 1; i < x_dim_nums; i++) { + remain_dims *= x_dims[i]; + } + for (int64_t i = start; i < end; i++) { + int32_t index_flag = (i / remain_dims) % x_dims[*input_dim]; + std::map::iterator f = index_dict.find(index_flag); + if (f != index_dict.end()) { + output_y[i] = *input_value; + } else { + output_y[i] = input_x[i]; + } + } + } +} + +template +uint32_t IndexFillCpuKernel::DoCompute(CpuKernelContext &ctx) { + int32_t *input_1 = reinterpret_cast(inputs_[1]->GetData()); + int32_t *input_2 = reinterpret_cast(inputs_[2]->GetData()); + + int32_t x_dim_nums = inputs_[0]->GetTensorShape()->GetDims(); + int32_t dim_nums = inputs_[1]->GetTensorShape()->GetDims(); + int32_t index_dim_nums = inputs_[2]->GetTensorShape()->GetDims(); + auto x_dims = inputs_[0]->GetTensorShape()->GetDimSizes(); + + uint32_t data_num = outputs_[0]->NumElements(); + int64_t index_num = inputs_[2]->GetTensorShape()->NumElements(); + + KERNEL_CHECK_FALSE(dim_nums == 0, KERNEL_STATUS_PARAM_INVALID, "Dim has to be a scalar.") + KERNEL_CHECK_FALSE(index_dim_nums <= 1, KERNEL_STATUS_PARAM_INVALID, "Index has to be a vector/scalar.") + + int32_t cur_dim = *input_1; + if (*input_1 < 0) { + *input_1 = *input_1 + x_dim_nums; + } + + std::map index_dict; + if (x_dim_nums == 0) { + for (int32_t i = 0; i < index_num; i++) { + if (input_2[i] < -1 || input_2[i] > 0) { + KERNEL_LOG_ERROR("Invalid argument 3: out of range."); + return KERNEL_STATUS_PARAM_INVALID; + } else { + index_dict.insert(std::pair(0, true)); + } + } + } else if (cur_dim < -x_dim_nums || cur_dim >= x_dim_nums) { + KERNEL_LOG_ERROR( + "Dimension out of range (expected to be in range of " + "[%d, %d], but got %d).", + 0 - x_dim_nums, x_dim_nums - 1, cur_dim); + return KERNEL_STATUS_PARAM_INVALID; + } else { + for (int32_t i = 0; i < index_num; i++) { + if (input_2[i] < -x_dims[*input_1] || input_2[i] >= x_dims[*input_1]) { + KERNEL_LOG_ERROR("Invalid argument 3: out of range."); + return KERNEL_STATUS_PARAM_INVALID; + } else { + input_2[i] = (input_2[i] < 0) ? 
(input_2[i] + x_dims[*input_1]) : input_2[i]; + index_dict.insert(std::pair(input_2[i], true)); + } + } + } + + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + + if (data_num <= kParallelDataNumMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + if (max_core_num > data_num) { + max_core_num = data_num; + } + if (max_core_num == 0) { + KERNEL_LOG_ERROR("The number of available CPU cores must be greater than 0!"); + } + + auto sharder_index_fill = [&](int64_t start, int64_t end) { SpecialCompute(start, end, input_1, index_dict); }; + + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_index_fill), + "IndexFill Compute failed."); + } else { + SpecialCompute(0, data_num, input_1, index_dict); + } + return KERNEL_STATUS_OK; +} + +uint32_t IndexFillCpuKernel::Compute(CpuKernelContext &ctx) { + uint32_t res = GetInputAndCheck(ctx); + if (res != KERNEL_STATUS_OK) { + return res; + } + + DataType input_type{ctx.Input(0)->GetDataType()}; + switch (input_type) { + INDEXFILL_COMPUTE_CASE(DT_INT8, int8_t, ctx) + INDEXFILL_COMPUTE_CASE(DT_INT16, int16_t, ctx) + INDEXFILL_COMPUTE_CASE(DT_INT32, int32_t, ctx) + INDEXFILL_COMPUTE_CASE(DT_INT64, int64_t, ctx) + INDEXFILL_COMPUTE_CASE(DT_UINT8, uint8_t, ctx) + INDEXFILL_COMPUTE_CASE(DT_UINT16, uint16_t, ctx) + INDEXFILL_COMPUTE_CASE(DT_UINT32, uint32_t, ctx) + INDEXFILL_COMPUTE_CASE(DT_UINT64, uint64_t, ctx) + INDEXFILL_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx) + INDEXFILL_COMPUTE_CASE(DT_FLOAT, float, ctx) + INDEXFILL_COMPUTE_CASE(DT_DOUBLE, double, ctx) + default: + KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(), + DTypeStr(input_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} +REGISTER_CPU_KERNEL(kIndexFill, IndexFillCpuKernel); +} // namespace aicpu \ No newline at end of file diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/index_fill.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/index_fill.h new file mode 100644 index 00000000000..7fad3371928 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/index_fill.h @@ -0,0 +1,40 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_ +#define AICPU_KERNELS_NORMALIZED_INDEX_FILL_H_ + +#include + +#include "cpu_ops_kernel.h" + +namespace aicpu { +class IndexFillCpuKernel : public CpuKernel { + public: + ~IndexFillCpuKernel() = default; + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + uint32_t DoCompute(CpuKernelContext &ctx); + uint32_t GetInputAndCheck(CpuKernelContext &ctx); + template + void SpecialCompute(int64_t start, int64_t end, const int32_t *input_dim, std::map &index_dict); + + std::vector inputs_; + std::vector outputs_; +}; +} // namespace aicpu +#endif \ No newline at end of file diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldiv.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldiv.cc new file mode 100644 index 00000000000..2a8f21aabd9 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldiv.cc @@ -0,0 +1,185 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kldiv.h" + +#include +#include +#include "cpu_kernel_utils.h" +#include "cpu_types.h" +#include "kernel_log.h" +#include "status.h" +#include "utils/kernel_util.h" + +namespace { +const std::uint32_t kKLDivInputNum{2}; +const std::uint32_t kKLDivOutputNum{1}; +const std::int64_t ParallelNum{4096}; +const char *kKLDiv{"KLDiv"}; +} // namespace + +namespace aicpu { +namespace detail { +template +inline std::uint32_t ComputeKLDivKernel(const CpuKernelContext &ctx) { + const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor; + auto input = static_cast(ctx.Input(0)->GetData()); + auto target = static_cast(ctx.Input(1)->GetData()); + auto output = static_cast(ctx.Output(0)->GetData()); + std::int64_t total = ctx.Input(0)->NumElements(); + std::size_t data_size = ctx.Input(0)->GetDataSize(); + uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx); + std::string reduction = ctx.GetAttr("reduction")->GetString(); + if (reduction != "sum" && reduction != "batchmean" && reduction != "none" && reduction != "mean") { + KERNEL_LOG_ERROR("%s is not a valid value for reduction", reduction.c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + bool parallel_flag = false; + if (data_size > ParallelNum * sizeof(T)) { + parallel_flag = true; + } + if (cores == 0) { + return KERNEL_STATUS_INNER_ERROR; + } + T *tmp_array = nullptr; + if (reduction == "none") { + tmp_array = output; + } else { + tmp_array = new T[total]; + } + if (parallel_flag) { + std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)}; + ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) { + std::int64_t length = end - begin; + Eigen::Map > array_input(input + begin, length, 1); + Eigen::Map > array_target(target + begin, length, 1); + Eigen::Map > array_reduce(tmp_array + begin, length, 1); + T constant_zero{0}; + 
array_reduce = array_target * (Eigen::log(array_target) - array_input); + for (std::int64_t idx = 0; idx < length; ++idx) { + if (!(target[begin + idx] > constant_zero)) { + array_reduce(idx) = constant_zero; + } + } + }); + } else { + Eigen::Map > array_input(input, total, 1); + Eigen::Map > array_target(target, total, 1); + Eigen::Map > array_reduce(tmp_array, total, 1); + array_reduce = array_target * (Eigen::log(array_target) - array_input); + T constant_zero{0}; + for (uint32_t idx = 0; idx < total; ++idx) { + if (!(target[idx] > constant_zero)) { + array_reduce(idx) = constant_zero; + } + } + } + Eigen::Map > reduce(tmp_array, total, 1); + if (reduction == "sum") { + output[0] = reduce.sum(); + } else if (reduction == "batchmean") { + std::vector input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes(); + output[0] = reduce.sum() / T(input_dims[0]); + } else if (reduction == "mean") { + output[0] = reduce.mean(); + } + if (reduction != "none") { + delete[] tmp_array; + } + return KERNEL_STATUS_OK; +} + +template +inline std::uint32_t ComputeKLDiv(const CpuKernelContext &ctx) { + uint32_t result = ComputeKLDivKernel(ctx); + if (result != 0) { + KERNEL_LOG_ERROR("KLDiv compute failed."); + } + return result; +} + +inline std::uint32_t KLDivExtraCheck(const CpuKernelContext &ctx) { + if (ctx.Input(0)->GetData() == nullptr) { + KERNEL_LOG_ERROR("Get input x data failed."); + return KERNEL_STATUS_PARAM_INVALID; + } + if (ctx.Input(1)->GetData() == nullptr) { + KERNEL_LOG_ERROR("Get input target data failed."); + return KERNEL_STATUS_PARAM_INVALID; + } + if (ctx.Output(0)->GetData() == nullptr) { + KERNEL_LOG_ERROR("Get output y data failed."); + return KERNEL_STATUS_PARAM_INVALID; + } + if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) { + KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].", + DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (ctx.Input(0)->GetDataSize() != ctx.Input(1)->GetDataSize()) { + KERNEL_LOG_ERROR( + "The data size of the input [%llu] need be the same as the target " + "[%llu].", + ctx.Input(0)->GetDataSize(), ctx.Input(1)->GetDataSize()); + return KERNEL_STATUS_PARAM_INVALID; + } + std::vector input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes(); + std::vector target_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes(); + if (input_dims.size() != target_dims.size()) { + KERNEL_LOG_ERROR( + "The data dim size of the input x [%llu] need be the same as the " + "target " + "[%llu].", + input_dims.size(), target_dims.size()); + return KERNEL_STATUS_PARAM_INVALID; + } + for (size_t index = 0; index < input_dims.size(); index++) { + if (input_dims[index] != target_dims[index]) { + KERNEL_LOG_ERROR("The data dim of the input x need be the same as the target."); + return KERNEL_STATUS_PARAM_INVALID; + } + } + return KERNEL_STATUS_OK; +} + +std::uint32_t KLDivCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) { + return NormalCheck(ctx, kKLDivInputNum, kKLDivOutputNum, {"reduction"}) ? 
KERNEL_STATUS_PARAM_INVALID + : KLDivExtraCheck(ctx); +} +// DT_FLOAT16, DT_FLOAT, DT_DOUBLE +std::uint32_t KLDivCompute(const CpuKernelContext &ctx) { + DataType input_type{ctx.Input(0)->GetDataType()}; + switch (input_type) { + case DT_FLOAT16: + return ComputeKLDiv(ctx); + case DT_FLOAT: + return ComputeKLDiv(ctx); + case DT_DOUBLE: + return ComputeKLDiv(ctx); + default: + KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } +} +} // namespace detail + +std::uint32_t KLDivCpuKernel::Compute(CpuKernelContext &ctx) { + return detail::KLDivCheck(ctx, kKLDivInputNum, kKLDivOutputNum) ? KERNEL_STATUS_PARAM_INVALID + : detail::KLDivCompute(ctx); +} + +REGISTER_CPU_KERNEL(kKLDiv, KLDivCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldiv.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldiv.h new file mode 100644 index 00000000000..235eb15dee5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldiv.h @@ -0,0 +1,27 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef AICPU_KERNELS_NORMALIZED_KLDIV_H_ +#define AICPU_KERNELS_NORMALIZED_KLDIV_H_ + +#include "cpu_ops_kernel.h" + +namespace aicpu { +class KLDivCpuKernel final : public CpuKernel { + virtual std::uint32_t Compute(CpuKernelContext &ctx) override final; +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldivlossgrad.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldivlossgrad.cc new file mode 100644 index 00000000000..3b3f771a378 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldivlossgrad.cc @@ -0,0 +1,226 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "kldivlossgrad.h" + +#include +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const char *kKlDivLossGrad = "KlDivLossGrad"; +const uint32_t kOutputNum = 1; +const uint32_t kInputNum = 3; +const uint32_t kGradIndex = 0; +const uint32_t kInputIndex = 1; +const uint32_t kTargetIndex = 2; +const std::string AttrReduction = "reduction"; +const std::string AttrLog = "log_target"; +const int64_t DataDefaultParallelNum = 16384; +} // namespace + +namespace aicpu { +template +void KlDivLossGradOp(Eigen::Map > &target, + Eigen::Map > &grad, + Eigen::Map > &output, std::int64_t &len, bool &log_target, + std::string &reduction) { + T constant_zero{0}; + if (log_target) { + output = -Eigen::exp(target) * grad; + return; + } + if (reduction == "none") { + for (uint32_t idx = 0; idx < len; ++idx) { + if (target(idx) > constant_zero) { + output(idx) = -target(idx) * grad(idx); + } + } + } else { + for (uint32_t idx = 0; idx < len; ++idx) { + if (target(idx) > constant_zero) { + output(idx) = -target(idx) * grad(0); + } + } + } + return; +} + +std::uint32_t KlDivLossGradExtraCheck(CpuKernelContext &ctx) { + Tensor *grad = ctx.Input(0); + Tensor *input = ctx.Input(1); + Tensor *target = ctx.Input(2); + Tensor *output = ctx.Output(0); + if (grad->GetDataSize() == 0) { + KERNEL_LOG_ERROR("[%s] grad is empty tensor.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (input->GetDataSize() == 0) { + KERNEL_LOG_ERROR("[%s] input is empty tensor.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (target->GetDataSize() == 0) { + KERNEL_LOG_ERROR("[%s] target is empty tensor.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (output->GetDataSize() == 0) { + KERNEL_LOG_ERROR("[%s] output is empty tensor.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if ((input->GetDataType() != grad->GetDataType()) || (target->GetDataType() != grad->GetDataType()) || + (output->GetDataType() != grad->GetDataType())) { + KERNEL_LOG_ERROR( + "The data type of the grad [%s], input [%s], target [%s], output y " + "[%s] must be the same type.", + DTypeStr(grad->GetDataType()).c_str(), DTypeStr(input->GetDataType()).c_str(), + DTypeStr(target->GetDataType()).c_str(), DTypeStr(output->GetDataType()).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + std::vector grad_dims = ctx.Input(kGradIndex)->GetTensorShape()->GetDimSizes(); + std::vector input_dims = ctx.Input(kInputIndex)->GetTensorShape()->GetDimSizes(); + std::vector target_dims = ctx.Input(kTargetIndex)->GetTensorShape()->GetDimSizes(); + std::vector output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes(); + std::string reduction = ctx.GetAttr(AttrReduction)->GetString(); + if (output_dims != input_dims) { + KERNEL_LOG_ERROR( + "The data shape of the output need be the same as the input. output " + "shape [%s], input shape [%s]", + VectorToString(output_dims).c_str(), VectorToString(input_dims).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (target_dims != input_dims) { + KERNEL_LOG_ERROR( + "The data shape of the target need be the same as the input. 
target " + "shape [%s], input shape [%s]", + VectorToString(target_dims).c_str(), VectorToString(input_dims).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (reduction == "mean" || reduction == "sum" || reduction == "batchmean") { + if (ctx.Input(0)->NumElements() != 1) { + KERNEL_LOG_ERROR("The data num of the grad [%llu] must be 1", ctx.Input(0)->NumElements()); + return KERNEL_STATUS_PARAM_INVALID; + } + } else if (reduction == "none") { + if (input_dims != grad_dims) { + KERNEL_LOG_ERROR( + "The data shape of the grad need be the same as the input. grad " + "shape " + "[%s], input shape [%s]", + VectorToString(grad_dims).c_str(), VectorToString(input_dims).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + } + return KERNEL_STATUS_OK; +} + +uint32_t KlDivLossGradCpuKernel::Compute(CpuKernelContext &ctx) { + if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) { + return KERNEL_STATUS_PARAM_INVALID; + } + if (KlDivLossGradExtraCheck(ctx) == KERNEL_STATUS_PARAM_INVALID) { + return KERNEL_STATUS_PARAM_INVALID; + } + // choose compute function depend on dataType + auto data_type = static_cast(ctx.Input(kFirstInputIndex)->GetDataType()); + switch (data_type) { + case DT_FLOAT16: + return KlDivLossGradCompute(ctx); + case DT_FLOAT: + return KlDivLossGradCompute(ctx); + case DT_DOUBLE: + return KlDivLossGradCompute(ctx); + default: + KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(), + DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } +} + +template +uint32_t KlDivLossGradCpuKernel::KlDivLossGradCompute(CpuKernelContext &ctx) { + int64_t grad_total = ctx.Input(0)->NumElements(); + int64_t input_total = ctx.Input(1)->NumElements(); + int64_t target_total = ctx.Input(2)->NumElements(); + int64_t output_y_total = ctx.Output(0)->NumElements(); + int64_t total = input_total; + uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx); + T *grad = (T *)(ctx.Input(0)->GetData()); + T *input = (T *)(ctx.Input(1)->GetData()); + T *target = (T *)(ctx.Input(2)->GetData()); + T *output = (T *)(ctx.Output(0)->GetData()); + bool parallel_flag = false; + uint64_t data_size = ctx.Input(1)->GetDataSize(); + // Determine whether to enable multi-core parallel computing + if (data_size > DataDefaultParallelNum * sizeof(T)) { + parallel_flag = true; + } + // Eigen::Array + bool log_target{false}; + if (ctx.GetAttr(AttrLog) != nullptr) { + log_target = ctx.GetAttr(AttrLog)->GetBool(); + } + std::string reduction{"mean"}; + if (ctx.GetAttr(AttrReduction) != nullptr) { + reduction = ctx.GetAttr(AttrReduction)->GetString(); + } + if (cores == 0) { + KERNEL_LOG_ERROR("KlDivLossGrad compute failed."); + return KERNEL_STATUS_INNER_ERROR; + } + if (parallel_flag) { + const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor; + std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)}; + auto shard_kldivlossgrad = [&](std::int64_t begin, std::int64_t end) { + std::int64_t length = end - begin; + std::int64_t grad_begin{0}, grad_length{grad_total}; + if (reduction == "none") { + grad_begin = begin; + grad_length = length; + } + Eigen::Map > array_grad(grad + grad_begin, grad_length, 1); + Eigen::Map > array_input(input + begin, length, 1); + Eigen::Map > array_target(target + begin, length, 1); + Eigen::Map > array_output(output + begin, length, 1); + T constant_zero{0}; + array_output = constant_zero; + KlDivLossGradOp(array_target, array_grad, array_output, length, log_target, reduction); + if 
(reduction == "mean") { + array_output = array_output / T(output_y_total); + } else if (reduction == "batchmean") { + std::vector input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes(); + array_output = array_output / T(input_dims[0]); + } + }; + KERNEL_HANDLE_ERROR(ParallelFor(ctx, total, per_unit_size, shard_kldivlossgrad), "KlDivLossGrad Compute failed."); + } else { + Eigen::Map > array_grad(grad, grad_total, 1); + Eigen::Map > array_input(input, input_total, 1); + Eigen::Map > array_target(target, target_total, 1); + Eigen::Map > array_output(output, output_y_total, 1); + T constant_zero{0}; + array_output = constant_zero; + KlDivLossGradOp(array_target, array_grad, array_output, output_y_total, log_target, reduction); + if (reduction == "mean") { + array_output = array_output / T(output_y_total); + } else if (reduction == "batchmean") { + std::vector input_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes(); + array_output = array_output / T(input_dims[0]); + } + } + return KERNEL_STATUS_OK; +} +REGISTER_CPU_KERNEL(kKlDivLossGrad, KlDivLossGradCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldivlossgrad.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldivlossgrad.h new file mode 100644 index 00000000000..9cdcf55c54e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/kldivlossgrad.h @@ -0,0 +1,42 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H +#define AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H +#define EIGEN_USE_THREADS +#define EIGEN_USE_SIMPLE_THREAD_POOL + +#include "cpu_ops_kernel.h" +#include "cpu_types.h" +#include "utils/bcast.h" + +namespace aicpu { +class KlDivLossGradCpuKernel : public CpuKernel { + public: + KlDivLossGradCpuKernel() = default; + ~KlDivLossGradCpuKernel() = default; + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + /** + * @brief compute for all types + * @param ctx cpu kernel context + * @return status if success + */ + template + uint32_t KlDivLossGradCompute(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif // AICPU_KERNELS_NORMALIZED_KLDIVLOSSGRAD_H diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lcm.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lcm.cc new file mode 100644 index 00000000000..46b5f3f94a1 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lcm.cc @@ -0,0 +1,173 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "lcm.h" + +#include +#include + +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t kLcmOutputNum = 1; +const uint32_t kLcmInputNum = 2; +const char *kLcm = "Lcm"; +// when input data size is more than kParallelDataNum, use Parallel func +const int64_t kParallelDataNum = 2 * 1024; +const int64_t kParallelDataNumMid = 16 * 1024; +const int32_t kInput_32_32 = 3; +const int32_t kInput_32_64 = 2; +const int32_t kInput_64_32 = 1; +const int32_t kInput_64_64 = 0; +} // namespace + +namespace aicpu { +// Simple recursive gcd. +template +T elewise_gcd(T a, T b) { + if (b == 0) { + return a; + } + return elewise_gcd(b, a % b); +} +// Simple lcm. +template +T elewise_lcm(T a, T b) { + T gcd_tmp = elewise_gcd(a, b); + if (gcd_tmp == 0) { + return static_cast(0); + } + return std::abs(a / gcd_tmp * b); +} + +uint32_t LcmIOTypeCheck(CpuKernelContext &ctx, int32_t &dual_types) { + Tensor *x1 = ctx.Input(kFirstInputIndex); + Tensor *x2 = ctx.Input(kSecondInputIndex); + Tensor *y = ctx.Output(kFirstOutputIndex); + const std::set supported_types{DT_INT32, DT_INT64}; + auto x1_type = x1->GetDataType(); + auto x2_type = x2->GetDataType(); + auto y_type = y->GetDataType(); + KERNEL_CHECK_FALSE(supported_types.count(x1_type) != 0, KERNEL_STATUS_PARAM_INVALID, + "[Lcm] input x1 data type [%s] is not supported.", DTypeStr(x1_type).c_str()); + KERNEL_CHECK_FALSE(supported_types.count(x2_type) != 0, KERNEL_STATUS_PARAM_INVALID, + "[Lcm] input x2 data type [%s] is not supported.", DTypeStr(x2_type).c_str()); + int32_t x1_is_i32 = static_cast(x1_type == DT_INT32) << 1; + int32_t x2_is_i32 = static_cast(x2_type == DT_INT32); + int32_t _dual_types = x1_is_i32 | x2_is_i32; + switch (_dual_types) { + case kInput_64_64: + case kInput_64_32: + case kInput_32_64: + KERNEL_CHECK_FALSE(y_type == DT_INT64, KERNEL_STATUS_PARAM_INVALID, + "[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str()); + dual_types = _dual_types; + break; + case kInput_32_32: + KERNEL_CHECK_FALSE(y_type == DT_INT32, KERNEL_STATUS_PARAM_INVALID, + "[Lcm] output y data type [%s] is not supported.", DTypeStr(y_type).c_str()); + dual_types = _dual_types; + break; + default: + KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported."); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t LcmElewiseCompute(CpuKernelContext &ctx, const T1 *x1_ptr, const T2 *x2_ptr, T3 *y_ptr, Bcast &bcast) { + int64_t data_num = ctx.Output(kFirstOutputIndex)->NumElements(); + auto lcm_shard = [&](int64_t start, int64_t end) { + for (int64_t i = start; i < end; ++i) { + T3 x1_ele_abs = std::abs(static_cast(x1_ptr[bcast.GetBroadcastXIndex(i)])); + T3 x2_ele_abs = std::abs(static_cast(x2_ptr[bcast.GetBroadcastYIndex(i)])); + y_ptr[i] = elewise_lcm(x1_ele_abs, x2_ele_abs); + } + }; + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + if (data_num <= kParallelDataNumMid) { + max_core_num = 
std::min(max_core_num, 4U); // up to 4 cpu cores + } + if (max_core_num > data_num) { + max_core_num = data_num; + } + if (max_core_num == 0) { + KERNEL_LOG_ERROR("[Lcm] max_core_num is 0, please check the cpu num."); + return KERNEL_STATUS_PARAM_INVALID; + } + uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, lcm_shard); + if (ret != KERNEL_STATUS_OK) { + KERNEL_LOG_ERROR("[Lcm] Lcm Compute failed."); + return ret; + } + } else { + lcm_shard(0, data_num); + } + + return KERNEL_STATUS_OK; +} + +template +uint32_t LcmCompute(CpuKernelContext &ctx) { + Tensor *x1 = ctx.Input(kFirstInputIndex); + Tensor *x2 = ctx.Input(kSecondInputIndex); + Tensor *y = ctx.Output(kFirstOutputIndex); + const T1 *x1_ptr = reinterpret_cast(x1->GetData()); + const T2 *x2_ptr = reinterpret_cast(x2->GetData()); + T3 *y_ptr = reinterpret_cast(y->GetData()); + auto x1_shape = x1->GetTensorShape()->GetDimSizes(); + auto x2_shape = x2->GetTensorShape()->GetDimSizes(); + Bcast bcast(x1_shape, x2_shape); + if (bcast.IsValid()) { + return LcmElewiseCompute(ctx, x1_ptr, x2_ptr, y_ptr, bcast); + } else { + KERNEL_LOG_ERROR("[Lcm] broadcast failed."); + return KERNEL_STATUS_PARAM_INVALID; + } +} + +uint32_t LcmCpuKernel::Compute(CpuKernelContext &ctx) { + // check params + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kLcmInputNum, kLcmOutputNum), "[Lcm] check input and output number failed."); + int32_t dual_types = static_cast(-1); + KERNEL_HANDLE_ERROR(LcmIOTypeCheck(ctx, dual_types), "[Lcm] check data type failed."); + switch (dual_types) { + case kInput_64_64: + return LcmCompute(ctx); + break; + case kInput_64_32: + return LcmCompute(ctx); + break; + case kInput_32_64: + return LcmCompute(ctx); + break; + case kInput_32_32: + return LcmCompute(ctx); + break; + default: + KERNEL_LOG_ERROR("[Lcm] input data type tuple is not supported."); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kLcm, LcmCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lcm.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lcm.h new file mode 100644 index 00000000000..b71f37b18f5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lcm.h @@ -0,0 +1,32 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef AICPU_KERNELS_NORMALIZED_LCM_H_
+#define AICPU_KERNELS_NORMALIZED_LCM_H_
+
+#include "cpu_ops_kernel.h"
+#include "utils/bcast.h"
+
+namespace aicpu {
+class LcmCpuKernel : public CpuKernel {
+ public:
+  ~LcmCpuKernel() override = default;
+
+ protected:
+  uint32_t Compute(CpuKernelContext &ctx) override;
+};
+}  // namespace aicpu
+#endif
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.cc
new file mode 100644
index 00000000000..5ab1093f2fe
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.cc
@@ -0,0 +1,126 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All right reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "logit.h"
+
+#include "Eigen/Core"
+#include "Eigen/Dense"
+#include "Eigen/LU"
+#include "cmath"
+#include "cpu_context.h"
+#include "cpu_kernel_utils.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+#include "utils/eigen_tensor.h"
+#include "utils/kernel_util.h"
+
+namespace {
+const uint32_t kOutputNum = 1;
+const uint32_t kInputNum = 1;
+const int64_t kParallelDataNumSameShape = 7 * 1024;
+const int64_t kParallelDataNumSameShapeMid = 16 * 1024;
+const char *kLogit = "Logit";
+
+#define LOGIT_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
+  case (DTYPE): {                                       \
+    uint32_t result = LogitCompute<TYPE>(CTX);          \
+    if (result != KERNEL_STATUS_OK) {                   \
+      KERNEL_LOG_ERROR("Logit kernel compute failed."); \
+      return result;                                    \
+    }                                                   \
+    break;                                              \
+  }
+}  // namespace
+
+namespace aicpu {
+uint32_t LogitCpuKernel::Compute(CpuKernelContext &ctx) {
+  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogit);
+  DataType data_type = ctx.Input(0)->GetDataType();
+  switch (data_type) {
+    LOGIT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
+    LOGIT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
+    LOGIT_COMPUTE_CASE(DT_FLOAT, float, ctx)
+    default:
+      KERNEL_LOG_ERROR("Logit kernel data type [%s] not support.", DTypeStr(data_type).c_str());
+      return KERNEL_STATUS_PARAM_INVALID;
+  }
+  return KERNEL_STATUS_OK;
+}
+
+template <typename T>
+uint32_t LogitCpuKernel::LogitCompute(CpuKernelContext &ctx) {
+  auto input_tensor = ctx.Input(0);
+  auto output_tensor = ctx.Output(0);
+  auto input = reinterpret_cast<T *>(input_tensor->GetData());
+  auto output = reinterpret_cast<T *>(output_tensor->GetData());
+  AttrValue *attr = ctx.GetAttr("eps");
+  float eps = -1.0;
+  if (attr != nullptr) {
+    eps = attr->GetFloat();
+  }
+  auto input_shape = input_tensor->GetTensorShape();
+  int64_t data_num = input_shape->NumElements();
+  if (data_num >= kParallelDataNumSameShape) {
+    uint32_t min_core_num = 1;
+    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
+    if (data_num <= kParallelDataNumSameShapeMid) {
+      max_core_num = std::min(max_core_num, 4U);
+    }
+    if (max_core_num > data_num) {
+      max_core_num = data_num;
+    }
+    auto shared_less =
[&](size_t start, size_t end) { + T one = T(1); + T up_bound = static_cast(1) - static_cast(eps); + if (eps < 0) { + for (size_t i = start; i < end; i++) { + T x = input[i]; + output[i] = log(x / (one - x)); + } + } else { + for (size_t i = start; i < end; i++) { + T z; + T x = input[i]; + z = x < static_cast(eps) ? static_cast(eps) : (x > up_bound ? up_bound : x); + output[i] = log(z / (one - z)); + } + } + }; + if (max_core_num == 0) { + KERNEL_LOG_ERROR("max core num is 0"); + return KERNEL_STATUS_PARAM_INVALID; + } + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less), + "Logit Compute failed."); + } else { + T one = T(1); + T up_bound = static_cast(1) - static_cast(eps); + if (eps < 0) { + for (int64_t i = 0; i < data_num; i++) { + T x = input[i]; + output[i] = log(x / (one - x)); + } + } else { + for (int64_t i = 0; i < data_num; i++) { + T z; + T x = input[i]; + z = x < static_cast(eps) ? static_cast(eps) : (x > up_bound ? up_bound : x); + output[i] = log(z / (one - z)); + } + } + } + return KERNEL_STATUS_OK; +} +REGISTER_CPU_KERNEL(kLogit, LogitCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h new file mode 100644 index 00000000000..1df19e6d37d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_H +#define AICPU_KERNELS_NORMALIZED_LOGIT_H + +#include "cpu_ops_kernel.h" +#include "utils/bcast.h" + +namespace aicpu { +class LogitCpuKernel : public CpuKernel { + public: + LogitCpuKernel() = default; + ~LogitCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + uint32_t LogitCompute(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit_grad.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit_grad.cc new file mode 100644 index 00000000000..2465c5fdea3 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit_grad.cc @@ -0,0 +1,133 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All right reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "logit_grad.h" + +#include "Eigen/Core" +#include "Eigen/Dense" +#include "Eigen/LU" +#include "cmath" +#include "cpu_context.h" +#include "cpu_kernel_utils.h" +#include "unsupported/Eigen/CXX11/Tensor" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t kOutputNum = 1; +const uint32_t kInputNum = 2; +const int64_t kParallelDataNumSameShape = 7 * 1024; +const int64_t kParallelDataNumSameShapeMid = 16 * 1024; +const char *kLogitGrad = "LogitGrad"; + +#define LOGITGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = LogitGradCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("LogitGrad kernel compute failed."); \ + return result; \ + } \ + break; \ + } +} // namespace + +namespace aicpu { +uint32_t LogitGradCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLogitGrad); + DataType data_type = ctx.Input(0)->GetDataType(); + switch (data_type) { + LOGITGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx) + LOGITGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx) + LOGITGRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx) + default: + KERNEL_LOG_ERROR("LogitGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t LogitGradCpuKernel::LogitGradCompute(CpuKernelContext &ctx) { + auto input_y_grad_tensor = ctx.Input(0); + auto input_x_tensor = ctx.Input(1); + auto output_x_grad_tensor = ctx.Output(0); + auto input_y_grad = reinterpret_cast(input_y_grad_tensor->GetData()); + auto input_x = reinterpret_cast(input_x_tensor->GetData()); + auto output_x_grad = reinterpret_cast(output_x_grad_tensor->GetData()); + auto input_shape = input_x_tensor->GetTensorShape(); + int64_t data_num = input_shape->NumElements(); + float eps = -1.0; + AttrValue *attr = ctx.GetAttr("eps"); + if (attr != nullptr) { + eps = attr->GetFloat(); + } + if (data_num >= kParallelDataNumSameShape) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx)); + if (data_num <= kParallelDataNumSameShapeMid) { + max_core_num = std::min(max_core_num, 4U); + } + if (max_core_num > data_num) { + max_core_num = data_num; + } + auto shared_less = [&](size_t start, size_t end) { + T one = T(1); + T zero = T(0); + T up_bound = static_cast(1) - static_cast(eps); + if (eps < 0) { + for (size_t i = start; i < end; i++) { + T y_grad = input_y_grad[i]; + T x = input_x[i]; + output_x_grad[i] = (x < zero || x > one) ? std::numeric_limits::quiet_NaN() : (y_grad / (x * (one - x))); + } + } else { + for (size_t i = start; i < end; i++) { + T y_grad = input_y_grad[i]; + T x = input_x[i]; + output_x_grad[i] = + static_cast(x) < static_cast(eps) || static_cast(x) > static_cast(up_bound) + ? zero + : (y_grad / (x * (one - x))); + } + } + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_less), + "LogitGrad Compute failed."); + } else { + T one = T(1); + T zero = T(0); + T up_bound = static_cast(1) - static_cast(eps); + if (eps < 0) { + for (int64_t i = 0; i < data_num; i++) { + T y_grad = input_y_grad[i]; + T x = input_x[i]; + output_x_grad[i] = (x < zero || x > one) ? 
std::numeric_limits::quiet_NaN() : (y_grad / (x * (one - x))); + } + } else { + for (int64_t i = 0; i < data_num; i++) { + T y_grad = input_y_grad[i]; + T x = input_x[i]; + output_x_grad[i] = + static_cast(x) < static_cast(eps) || static_cast(x) > static_cast(up_bound) + ? zero + : (y_grad / (x * (one - x))); + } + } + } + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kLogitGrad, LogitGradCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit_grad.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit_grad.h new file mode 100644 index 00000000000..60f8f3ea786 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit_grad.h @@ -0,0 +1,36 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H +#define AICPU_KERNELS_NORMALIZED_LOGIT_GRAD_H + +#include "cpu_ops_kernel.h" +#include "utils/bcast.h" + +namespace aicpu { +class LogitGradCpuKernel : public CpuKernel { + public: + LogitGradCpuKernel() = default; + ~LogitGradCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + uint32_t LogitGradCompute(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lower_bound.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lower_bound.cc new file mode 100644 index 00000000000..aa430faf064 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lower_bound.cc @@ -0,0 +1,153 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "lower_bound.h" + +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t kInputNum = 2; +const uint32_t kOutputNum = 1; +const char *kLowerBound = "LowerBound"; + +#define LOWERBOUND_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \ + case (DTYPE): { \ + uint32_t result = LowerBoundCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("LowerBound kernel compute failed."); \ + return result; \ + } \ + break; \ + } + +#define LOWERBOUND_COMPUTE_CASE_ALL(TYPE, CTX) \ + LOWERBOUND_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \ + LOWERBOUND_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \ + LOWERBOUND_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \ + LOWERBOUND_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \ + LOWERBOUND_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \ + LOWERBOUND_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \ + LOWERBOUND_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \ + LOWERBOUND_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \ + LOWERBOUND_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) +} // namespace + +namespace aicpu { +uint32_t LowerBoundCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LowerBound check input and output number failed."); + Tensor *sorted_x_data = ctx.Input(0); + Tensor *values_data = ctx.Input(1); + Tensor *output_data = ctx.Output(0); + auto output_type = output_data->GetDataType(); + auto sorted_x_type = sorted_x_data->GetDataType(); + auto values_type = values_data->GetDataType(); + if (sorted_x_type != values_type) { + KERNEL_LOG_ERROR("Input[0] data type[%s] must be same with Input[1] data type[%s]", DTypeStr(sorted_x_type).c_str(), + DTypeStr(values_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + switch (output_type) { + case DT_INT32: + switch (sorted_x_type) { + LOWERBOUND_COMPUTE_CASE_ALL(int32_t, ctx) + default: + KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + break; + case DT_INT64: + switch (sorted_x_type) { + LOWERBOUND_COMPUTE_CASE_ALL(int64_t, ctx) + default: + KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + break; + default: + KERNEL_LOG_ERROR("Output data type[%s] not supported.", DTypeStr(output_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t LowerBoundCpuKernel::LowerBoundCompute(CpuKernelContext &ctx) { + Tensor *sorted_x_data = ctx.Input(0); + auto sorted_x_data_addr = reinterpret_cast(sorted_x_data->GetData()); + auto sorted_x_data_shape = sorted_x_data->GetTensorShape(); + std::vector sorted_x_data_shape_dims = sorted_x_data_shape->GetDimSizes(); + Tensor *values_data = ctx.Input(1); + auto values_data_addr = reinterpret_cast(values_data->GetData()); + auto values_data_shape = values_data->GetTensorShape(); + int64_t values_data_num = values_data_shape->NumElements(); + std::vector values_data_shape_dims = values_data_shape->GetDimSizes(); + Tensor *output_data = ctx.Output(0); + auto output_data_addr = reinterpret_cast(output_data->GetData()); + if (sorted_x_data_shape_dims[0] != values_data_shape_dims[0]) { + KERNEL_LOG_ERROR("The number of rows of Input[0]:([%d]) should be consistent with that of Input[1]:([%d]).", + sorted_x_data_shape_dims[0], values_data_shape_dims[0]); + return KERNEL_STATUS_PARAM_INVALID; + } + int64_t sorted_x_data_column = 
sorted_x_data_shape_dims[1]; + int64_t values_data_column = values_data_shape_dims[1]; + if (values_data_num < 1024) { + for (int64_t i = 0; i < values_data_num; i++) { + int64_t seq_row = i / values_data_column; + int64_t low = seq_row * sorted_x_data_column; + int64_t up = (seq_row + 1) * sorted_x_data_column - 1; + int64_t mid; + while (low <= up) { + mid = (low + up) / 2; + if (values_data_addr[i] <= sorted_x_data_addr[mid]) { + up = mid - 1; + } else { + low = mid + 1; + } + } + output_data_addr[i] = low - seq_row * sorted_x_data_column; + } + } else { + uint32_t min_core_num = 1; + int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2); + if (sum_core_num > values_data_num) { + sum_core_num = values_data_num; + } + auto shard_compute = [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + int64_t seq_row = i / values_data_column; + int64_t low = seq_row * sorted_x_data_column; + int64_t up = (seq_row + 1) * sorted_x_data_column - 1; + int64_t mid; + while (low <= up) { + mid = (low + up) / 2; + if (values_data_addr[i] <= sorted_x_data_addr[mid]) { + up = mid - 1; + } else { + low = mid + 1; + } + } + output_data_addr[i] = low - seq_row * sorted_x_data_column; + } + }; + KERNEL_HANDLE_ERROR( + CpuKernelUtils::ParallelFor(ctx, values_data_num, values_data_num / sum_core_num, shard_compute), + "LowerBound Compute failed."); + } + return KERNEL_STATUS_OK; +} +REGISTER_CPU_KERNEL(kLowerBound, LowerBoundCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lower_bound.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lower_bound.h new file mode 100644 index 00000000000..93d35dbdebc --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lower_bound.h @@ -0,0 +1,35 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_ +#define AICPU_KERNELS_NORMALIZED_LOWERBOUND_H_ + +#include "cpu_ops_kernel.h" + +namespace aicpu { +class LowerBoundCpuKernel : public CpuKernel { + public: + LowerBoundCpuKernel() = default; + ~LowerBoundCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + static uint32_t LowerBoundCompute(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lstsq.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lstsq.cc new file mode 100644 index 00000000000..c38cfca920c --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lstsq.cc @@ -0,0 +1,115 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "lstsq.h" +#include "cpu_kernel_utils.h" +#include "utils/kernel_util.h" +#include "utils/eigen_tensor.h" +#include +#include +#include +namespace { +const uint32_t kOutputNum = 1; +const uint32_t kInputNum = 2; +const char *kLstsq = "Lstsq"; +} // namespace +// namespace aicpu +namespace aicpu { +uint32_t LstsqCpuKernel::Compute(CpuKernelContext &ctx) { + // check params + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Lstsq check input and output number failed."); + Tensor *input_x0 = ctx.Input(0); + Tensor *input_x1 = ctx.Input(1); + Tensor *output = ctx.Output(0); + auto dims_0 = input_x0->GetTensorShape()->GetDims(); + auto dims_1 = input_x1->GetTensorShape()->GetDims(); + KERNEL_CHECK_FALSE((dims_0 == 2), KERNEL_STATUS_PARAM_INVALID, "Dimension of input[0] must be 2, but got[%zu].", + dims_0); + KERNEL_CHECK_FALSE(((dims_1 == 2) || (dims_1 == 1)), KERNEL_STATUS_PARAM_INVALID, + "Dimension of input[1] must be 2 or 1, but got[%zu].", dims_1); + auto shape_0 = input_x0->GetTensorShape(); + auto shape_1 = input_x1->GetTensorShape(); + KERNEL_CHECK_FALSE((shape_0->GetDimSize(0) == shape_1->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID, + "Lstsq shape_0[0] and shape_1[0] not equal.", shape_0->GetDimSize(0), shape_0->GetDimSize(1)); + AttrValue *I2_regularizer = ctx.GetAttr("l2_regularizer"); + AttrValue *fast = ctx.GetAttr("fast"); + KERNEL_CHECK_NULLPTR(I2_regularizer, KERNEL_STATUS_PARAM_INVALID, "Get l2_regularizer failed."); + KERNEL_CHECK_NULLPTR(fast, KERNEL_STATUS_PARAM_INVALID, "Get fast failed."); + KERNEL_LOG_DEBUG( + "LstsqCpuKernel[%s], inputx0: size[%llu];" + "inputx1: size[%llu], output: size[%llu].", + ctx.GetOpType().c_str(), input_x0->GetDataSize(), input_x1->GetDataSize(), output->GetDataSize()); + DataType data_type1 = ctx.Input(0)->GetDataType(); + DataType data_type2 = ctx.Input(1)->GetDataType(); + KERNEL_CHECK_FALSE((data_type1 == data_type2), KERNEL_STATUS_PARAM_INVALID, + "Lstsq input_0_dtype must be equal to input_1_dtype.", data_type1, data_type2); + switch (data_type1) { + case DT_FLOAT16: + 
return LstsqCompute<Eigen::half, float>(ctx);
+    case DT_FLOAT:
+      return LstsqCompute<float, float>(ctx);
+    case DT_DOUBLE:
+      return LstsqCompute<double, double>(ctx);
+    default:
+      KERNEL_LOG_ERROR("Lstsq kernel data type [%s] not support.", DTypeStr(data_type1).c_str());
+      return KERNEL_STATUS_PARAM_INVALID;
+  }
+  return KERNEL_STATUS_OK;
+}
+
+template <typename T, typename T2>
+uint32_t LstsqCpuKernel::LstsqCompute(CpuKernelContext &ctx) {
+  Eigen::Index m = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
+  Eigen::Index n = ctx.Input(0)->GetTensorShape()->GetDimSize(1);
+  Eigen::Index k = 1;
+  if (ctx.Input(1)->GetTensorShape()->GetDims() == 2) {
+    k = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
+  }
+
+  typedef Eigen::Matrix<T2, Eigen::Dynamic, Eigen::Dynamic> MatrixXd;
+  MatrixXd A(m, n);
+  MatrixXd B(m, k);
+
+  auto aptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
+  auto bptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
+
+  for (int i = 0; i < m * n; i++) {
+    *(A.data() + i) = static_cast<T2>(*(aptr + i));
+  }
+  for (int i = 0; i < m * k; i++) {
+    *(B.data() + i) = static_cast<T2>(*(bptr + i));
+  }
+
+  MatrixXd result(n, k);
+  if (m >= n) {
+    // Overdetermined (or square) system: least-squares solution via column-pivoted Householder QR.
+    result = A.colPivHouseholderQr().solve(B);
+  } else {
+    // Underdetermined system: minimum-norm solution x = A^T (A A^T)^{-1} B.
+    MatrixXd A_Transpose = A.transpose();
+    MatrixXd temp = A * A_Transpose;
+    MatrixXd tempI = temp.inverse();
+    MatrixXd x = A_Transpose * tempI;
+    MatrixXd output = x * B;
+    result = output;
+  }
+  auto output_addr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
+  for (int i = 0; i < n * k; i++) {
+    *(output_addr + i) = static_cast<T>(*(result.data() + i));
+  }
+  return KERNEL_STATUS_OK;
+}
+
+REGISTER_CPU_KERNEL(kLstsq, LstsqCpuKernel);
+} // namespace aicpu
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lstsq.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lstsq.h
new file mode 100644
index 00000000000..99c323f3765
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lstsq.h
@@ -0,0 +1,37 @@
+
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef AICPU_KERNELS_NORMALIZED_LSTSQ_H_ +#define AICPU_KERNELS_NORMALIZED_LSTSQ_H_ + +#include "cpu_ops_kernel.h" + +namespace aicpu { + +class LstsqCpuKernel : public CpuKernel { + public: + LstsqCpuKernel() = default; + ~LstsqCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + static uint32_t LstsqCompute(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_solve.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_solve.cc new file mode 100644 index 00000000000..d1e70be960b --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_solve.cc @@ -0,0 +1,185 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "lu_solve.h" +#include "cpu_kernel_utils.h" +#include "utils/kernel_util.h" +#include "utils/eigen_tensor.h" +#include +#include +namespace { +const uint32_t kOutputNum = 1; +const uint32_t kInputNum = 3; +const int64_t kParallelBatchNum1 = 50; +const int64_t kParallelBatchNum4 = 200; +const int64_t kParallelBatchNum8 = 500; +const int64_t kParallelBatchNumx = 1000; +const char *kLuSolve = "LuSolve"; +} // namespace +namespace aicpu { +uint32_t LuSolveCpuKernel::Compute(CpuKernelContext &ctx) { + // check params + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check LuSolve params failed."); + Tensor *input_0 = ctx.Input(0); + KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input0 data failed."); + Tensor *input_1 = ctx.Input(1); + KERNEL_CHECK_NULLPTR(input_1->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input1 data failed."); + Tensor *input_2 = ctx.Input(2); + KERNEL_CHECK_NULLPTR(input_2->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input2 data failed."); + Tensor *output = ctx.Output(0); + auto input_0_Shape = input_0->GetTensorShape(); + KERNEL_CHECK_NULLPTR(input_0_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_0_Shape failed.") + auto input_1_Shape = input_1->GetTensorShape(); + KERNEL_CHECK_NULLPTR(input_1_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_1_Shape failed.") + auto input_2_Shape = input_2->GetTensorShape(); + KERNEL_CHECK_NULLPTR(input_2_Shape, KERNEL_STATUS_PARAM_INVALID, "Get input_2_Shape failed.") + int32_t b_dims = input_0_Shape->GetDims(); + int32_t lu_dims = input_1_Shape->GetDims(); + int32_t pivots_dims = input_2_Shape->GetDims(); + std::vector b_dims_vector = input_0_Shape->GetDimSizes(); + std::vector lu_dims_vector = input_1_Shape->GetDimSizes(); + std::vector pivots_dims_vector = input_2_Shape->GetDimSizes(); + if (b_dims == lu_dims) { + for (int32_t i = 0; i <= b_dims - 2; i++) { + if (b_dims_vector[i] != lu_dims_vector[i]) { + KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!"); + return KERNEL_STATUS_PARAM_INVALID; + } + } + } else if (lu_dims > b_dims) { + for 
(int32_t i = 0; i < b_dims - 2; i++) { + if (b_dims_vector[i] != lu_dims_vector[lu_dims - b_dims + i]) { + KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!"); + return KERNEL_STATUS_PARAM_INVALID; + } + } + } else { + for (int32_t i = 0; i < lu_dims - 2; i++) { + if (lu_dims_vector[i] != b_dims_vector[b_dims - lu_dims + i]) { + KERNEL_LOG_ERROR("Incompatible matrix sizes for lu_solve!"); + return KERNEL_STATUS_PARAM_INVALID; + } + } + } + for (int32_t i = 0; i < pivots_dims; i++) { + if (lu_dims_vector[i] != pivots_dims_vector[i]) { + KERNEL_LOG_ERROR("batch dimension of LU_pivots doesn't match batch dimension of LU_data!"); + return KERNEL_STATUS_PARAM_INVALID; + } + } + auto data_type = ctx.Input(0)->GetDataType(); + KERNEL_LOG_DEBUG( + "LuSolveCpuKernel[%s], input_0: size[%llu], input_1: size[%llu], input_2: size[%llu]" + "output: size[%llu].", + ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), input_2->GetDataSize(), + output->GetDataSize()); + switch (data_type) { + case DT_FLOAT: + return LuSolveCompute(ctx); + case DT_FLOAT16: + return LuSolveCompute(ctx); + default: + KERNEL_LOG_ERROR("LuSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t LuSolveCpuKernel::LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr, + int32_t *pivots_working_ptr, int64_t b_stride, int64_t a) { + auto output_y = reinterpret_cast(ctx.Output(0)->GetData()); + auto input_0_Shape = ctx.Input(0)->GetTensorShape(); + auto input_1_Shape = ctx.Input(1)->GetTensorShape(); + int32_t lu_dims = input_1_Shape->GetDims(); + int64_t lu_maxtrix_sizes = input_1_Shape->GetDimSize(lu_dims - 2); + int32_t b_dim = input_0_Shape->GetDims(); + int64_t b_m = input_0_Shape->GetDimSize(b_dim - 1); + typedef Eigen::Matrix MatrixXd; + MatrixXd matrix_b = Eigen::Map(b_working_ptr, lu_maxtrix_sizes, b_m); + MatrixXd matrix_A = Eigen::Map(lu_working_ptr, lu_maxtrix_sizes, lu_maxtrix_sizes); + for (int64_t i = 0; i < input_0_Shape->GetDimSize(b_dim - 2); i++) { + matrix_b.row(i).swap(matrix_b.row(*(pivots_working_ptr + i) - 1)); + } + MatrixXd L = matrix_A.template triangularView(); + MatrixXd U = matrix_A.template triangularView(); + MatrixXd result = (L * U).lu().solve(matrix_b); + for (int64_t m = 0; m < b_stride; m++) { + *(output_y + a * b_stride + m) = (T2) * (result.data() + m); + } + return KERNEL_STATUS_OK; +} + +template +uint32_t LuSolveCpuKernel::LuSolveCompute(CpuKernelContext &ctx) { + auto input_x0 = reinterpret_cast(ctx.Input(0)->GetData()); + auto input_x1 = reinterpret_cast(ctx.Input(1)->GetData()); + auto input_x2 = reinterpret_cast(ctx.Input(2)->GetData()); + auto input_0_Shape = ctx.Input(0)->GetTensorShape(); + auto input_1_Shape = ctx.Input(1)->GetTensorShape(); + auto input_2_Shape = ctx.Input(2)->GetTensorShape(); + T *input_0 = new T[input_0_Shape->NumElements()]; + T *input_1 = new T[input_1_Shape->NumElements()]; + for (int64_t i = 0; i < input_0_Shape->NumElements(); i++) { + *(input_0 + i) = (T) * (input_x0 + i); + } + for (int64_t i = 0; i < input_1_Shape->NumElements(); i++) { + *(input_1 + i) = (T) * (input_x1 + i); + } + int32_t b_dims = input_0_Shape->GetDims(); + int32_t lu_dims = input_1_Shape->GetDims(); + std::vector b_dims_vector = input_0_Shape->GetDimSizes(); + std::vector lu_dims_vector = input_1_Shape->GetDimSizes(); + std::vector pivots_dims_vector = input_2_Shape->GetDimSizes(); + int64_t b_stride = input_0_Shape->GetDimSize(b_dims - 
1) * input_0_Shape->GetDimSize(b_dims - 2);
+  int64_t lu_stride = input_1_Shape->GetDimSize(lu_dims - 1) * input_1_Shape->GetDimSize(lu_dims - 2);
+  int64_t pivots_stride = input_1_Shape->GetDimSize(lu_dims - 1);
+  std::vector<int64_t> b_shape = b_dims_vector;
+  std::vector<int64_t> lu_shape = lu_dims_vector;
+  for (size_t i = 0; i < 2; i++) {
+    b_shape.pop_back();
+    lu_shape.pop_back();
+  }
+  Bcast bcast(b_shape, lu_shape);
+  int64_t batch_num = ctx.Output(0)->NumElements() / b_stride;
+  // Each broadcast batch is solved independently: small batches run serially, larger ones are sharded across cores.
+  if (batch_num < kParallelBatchNum1) {
+    for (int64_t i = 0; i < batch_num; i++) {
+      T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
+      T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
+      int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
+      LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
+    }
+  } else {
+    uint32_t min_core_num = 1;
+    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
+    if (batch_num < kParallelBatchNumx) max_core_num = 8U;
+    if (batch_num < kParallelBatchNum8) max_core_num = 4U;
+    if (batch_num < kParallelBatchNum4) max_core_num = 2U;
+    auto sharder = [&](int64_t start, int64_t end) {
+      for (int64_t i = start; i < end; i++) {
+        T *b_working_ptr = &input_0[bcast.GetBroadcastXIndex(i) * b_stride];
+        T *lu_working_ptr = &input_1[bcast.GetBroadcastYIndex(i) * lu_stride];
+        int32_t *pivots_working_ptr = &input_x2[bcast.GetBroadcastYIndex(i) * pivots_stride];
+        LuSolve<T, T2>(ctx, b_working_ptr, lu_working_ptr, pivots_working_ptr, b_stride, i);
+      }
+    };
+    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder),
+                        "LuSolve Compute failed.");
+  }
+  // Release the temporary converted copies of the inputs allocated above.
+  delete[] input_0;
+  delete[] input_1;
+  return KERNEL_STATUS_OK;
+}
+REGISTER_CPU_KERNEL(kLuSolve, LuSolveCpuKernel);
+} // namespace aicpu
\ No newline at end of file
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_solve.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_solve.h
new file mode 100644
index 00000000000..0a98e8ac5a8
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_solve.h
@@ -0,0 +1,22 @@
+#ifndef AICPU_KERNELS_NORMALIZED_LUSOLVE_H_
+#define AICPU_KERNELS_NORMALIZED_LUSOLVE_H_
+
+#include "cpu_ops_kernel.h"
+#include "utils/bcast.h"
+namespace aicpu {
+
+class LuSolveCpuKernel : public CpuKernel {
+ public:
+  LuSolveCpuKernel() = default;
+  ~LuSolveCpuKernel() = default;
+  uint32_t Compute(CpuKernelContext &ctx) override;
+
+ private:
+  template <typename T, typename T2>
+  static uint32_t LuSolve(CpuKernelContext &ctx, T *b_working_ptr, T *lu_working_ptr, int32_t *pivots_working_ptr,
+                          int64_t b_stride, int64_t i);
+  template <typename T, typename T2>
+  static uint32_t LuSolveCompute(CpuKernelContext &ctx);
+};
+} // namespace aicpu
+#endif
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack.cc
new file mode 100644
index 00000000000..cc6da406b8e
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack.cc
@@ -0,0 +1,321 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "lu_unpack.h" +#include +#include +#include +#include +#include "cpu_context.h" +#include "cpu_ops_kernel.h" +#include "cpu_kernel_utils.h" +#include "cpu_tensor.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t kOutputNum = 3; +const uint32_t kInputNum = 2; +const uint32_t kFirstInputIndex = 0; +const uint32_t kSecondInputIndex = 1; +const uint32_t kFirstOutputIndex = 0; +const uint32_t kSecondOutputIndex = 1; +const uint32_t kThirdOutputIndex = 2; +const int32_t kLuDataMinRank = 2; +const int32_t kLuPivotsMinRank = 2; +const int64_t kParallelBatchNum = 70; +const char *kLuUnpack = "LuUnpack"; +} // namespace +namespace aicpu { +template +uint32_t LuUnpackCpuKernel::LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index, + T_data *P_eye) { + int32_t Lu_data_dims = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDims(); + int64_t Lu_data_dim1 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 2); + int64_t Lu_data_dim2 = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSize(Lu_data_dims - 1); + int32_t Lu_pivots_dims = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDims(); + int64_t Lu_pivots_dim = ctx.Input(kSecondInputIndex)->GetTensorShape()->GetDimSize(Lu_pivots_dims - 1); + int64_t matrix_width = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 2]; + int64_t matrix_height = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes()[Lu_data_dims - 1]; + int64_t pivots_stride = Lu_data_dim1 * Lu_data_dim1; + int64_t L_stride = 0; + int64_t U_stride = 0; + if (Lu_data_dim1 > Lu_data_dim2) { + L_stride = Lu_data_dim1 * Lu_data_dim2; + U_stride = Lu_data_dim2 * Lu_data_dim2; + } else { + L_stride = Lu_data_dim1 * Lu_data_dim1; + U_stride = Lu_data_dim1 * Lu_data_dim2; + } + int64_t matrix_size = matrix_width * matrix_height; + using MatrixMap = Eigen::Map>; + MatrixMap input(reinterpret_cast(ctx.Input(kFirstInputIndex)->GetData()) + matrix_index * matrix_size, + matrix_width, matrix_height); + // Triu + if (matrix_width > matrix_height) { + MatrixMap output2(reinterpret_cast(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride, + matrix_height, matrix_height); + T_data *MiddlePtr = new T_data[matrix_size]; + MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height); + MiddleData = input.template triangularView(); + output2 = MiddleData.block(0, 0, matrix_height, matrix_height); + delete[] MiddlePtr; + } else { + MatrixMap output2(reinterpret_cast(ctx.Output(kThirdOutputIndex)->GetData()) + matrix_index * U_stride, + matrix_width, matrix_height); + output2 = input.template triangularView(); + } + // Tril + if (matrix_height > matrix_width) { + MatrixMap output1(reinterpret_cast(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride, + matrix_width, matrix_width); + T_data *MiddlePtr = new T_data[matrix_size]; + MatrixMap MiddleData(MiddlePtr, matrix_width, matrix_height); + MiddleData = input.template triangularView(); + output1 = MiddleData.block(0, 0, matrix_width, matrix_width); + 
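+    // Only the leading matrix_width x matrix_width block of the unit-lower view is kept here:
+    // when LU_data has more columns than rows, L is square with side length equal to the row count.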
delete[] MiddlePtr; + } else { + MatrixMap output1(reinterpret_cast(ctx.Output(kSecondOutputIndex)->GetData()) + matrix_index * L_stride, + matrix_width, matrix_height); + output1 = input.template triangularView(); + } + // Swap + std::vector final_order; + final_order.resize(Lu_data_dim1); + for (int i = 0; i < Lu_data_dim1; i++) { + final_order[i] = T_pivots(i); + } + for (T_pivots id = 0; id < Lu_pivots_dim; id++) { + int64_t perm_id = 0; + int64_t perm_pivots_id = 0; + for (int64_t i = 0; i < Lu_data_dim1; i++) { + if (id == final_order[i]) { + perm_id = i; + } + if (!((*(Lu_pivots_working_ptr + id) <= Lu_data_dim1) && (*(Lu_pivots_working_ptr + id) >= 1))) { + return KERNEL_STATUS_PARAM_INVALID; + } + if ((*(Lu_pivots_working_ptr + id) - 1) == final_order[i]) { + perm_pivots_id = i; + } + } + std::swap(final_order[perm_id], final_order[perm_pivots_id]); + } + // Index_select + auto output_y0 = reinterpret_cast(ctx.Output(kFirstOutputIndex)->GetData()); + int64_t indices_num = final_order.size(); + int64_t inner_size = Lu_data_dim1; + int64_t slice_size = inner_size * sizeof(T_data); + for (int64_t j = 0; j < indices_num; ++j) { + auto params_idx = final_order[j] * inner_size; + auto out_idx = j * inner_size; + memcpy(output_y0 + matrix_index * pivots_stride + out_idx, P_eye + params_idx, slice_size); + } + return KERNEL_STATUS_OK; +} + +template +uint32_t LuUnpackCpuKernel::LuUnpackCompute(CpuKernelContext &ctx) { + Tensor *input0_tensor = ctx.Input(kFirstInputIndex); + Tensor *input1_tensor = ctx.Input(kSecondInputIndex); + auto input_0_Shape = input0_tensor->GetTensorShape(); + auto input_1_Shape = input1_tensor->GetTensorShape(); + int32_t Lu_data_dims = input_0_Shape->GetDims(); + int64_t Lu_data_dim1 = input_0_Shape->GetDimSize(Lu_data_dims - 2); + int64_t Lu_data_dim2 = input_0_Shape->GetDimSize(Lu_data_dims - 1); + int32_t Lu_pivots_dims = input_1_Shape->GetDims(); + int64_t Lu_pivots_dim = input_1_Shape->GetDimSize(Lu_pivots_dims - 1); + auto input_dim_size = input_0_Shape->GetDimSizes(); + auto input_x1 = reinterpret_cast(input1_tensor->GetData()); + + int32_t block_size = Lu_data_dim1 * Lu_data_dim1; + T_data *P_eye = new T_data[block_size]{}; + T_data num = static_cast(1); + for (int32_t i = 0; i < Lu_data_dim1; i++) { + *(P_eye + (Lu_data_dim1 + 1) * i) = num; + } + uint32_t check_status = 0; + int64_t Lu_data_stride = Lu_data_dim1 * Lu_data_dim2; + int64_t Lu_pivots_stride = Lu_pivots_dim; + int64_t batch_num = ctx.Input(0)->NumElements() / Lu_data_stride; + if (batch_num < kParallelBatchNum || Lu_data_dims == kLuDataMinRank) { + for (int64_t matrix_index = 0; matrix_index < batch_num; matrix_index++) { + T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride; + check_status = LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye); + if (check_status == KERNEL_STATUS_PARAM_INVALID) { + return check_status; + } + } + } else { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx)); + if (max_core_num > batch_num) { + max_core_num = batch_num; + } + uint32_t parallel_status = 0; + auto sharder = [&](int64_t start, int64_t end) { + for (int64_t matrix_index = start; matrix_index < end; matrix_index++) { + T_pivots *Lu_pivots_working_ptr = input_x1 + matrix_index * Lu_pivots_stride; + if (LuUnpack(ctx, Lu_pivots_working_ptr, matrix_index, P_eye) == KERNEL_STATUS_OK) { + parallel_status = KERNEL_STATUS_OK; + } else { + parallel_status = KERNEL_STATUS_PARAM_INVALID; + break; + } + } + }; + if 
(max_core_num == 0) { + KERNEL_LOG_ERROR("max_core_num could not be 0."); + } + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_num, batch_num / max_core_num, sharder), + "LuUnpack Compute failed."); + if (parallel_status != KERNEL_STATUS_OK) { + return KERNEL_STATUS_PARAM_INVALID; + } + } + delete[] P_eye; + return KERNEL_STATUS_OK; +} + +void LuUnpackCpuKernel::SetMap() { + calls_[DT_INT8][DT_INT8] = LuUnpackCompute; + calls_[DT_INT8][DT_UINT8] = LuUnpackCompute; + calls_[DT_INT8][DT_INT16] = LuUnpackCompute; + calls_[DT_INT8][DT_INT32] = LuUnpackCompute; + calls_[DT_INT8][DT_INT64] = LuUnpackCompute; + + calls_[DT_INT16][DT_INT8] = LuUnpackCompute; + calls_[DT_INT16][DT_INT16] = LuUnpackCompute; + calls_[DT_INT16][DT_INT32] = LuUnpackCompute; + calls_[DT_INT16][DT_INT64] = LuUnpackCompute; + calls_[DT_INT16][DT_UINT8] = LuUnpackCompute; + + calls_[DT_INT32][DT_INT8] = LuUnpackCompute; + calls_[DT_INT32][DT_INT16] = LuUnpackCompute; + calls_[DT_INT32][DT_INT32] = LuUnpackCompute; + calls_[DT_INT32][DT_INT64] = LuUnpackCompute; + calls_[DT_INT32][DT_UINT8] = LuUnpackCompute; + + calls_[DT_INT64][DT_INT8] = LuUnpackCompute; + calls_[DT_INT64][DT_INT16] = LuUnpackCompute; + calls_[DT_INT64][DT_INT32] = LuUnpackCompute; + calls_[DT_INT64][DT_INT64] = LuUnpackCompute; + calls_[DT_INT64][DT_UINT8] = LuUnpackCompute; + + calls_[DT_FLOAT16][DT_INT8] = LuUnpackCompute; + calls_[DT_FLOAT16][DT_INT16] = LuUnpackCompute; + calls_[DT_FLOAT16][DT_INT32] = LuUnpackCompute; + calls_[DT_FLOAT16][DT_INT64] = LuUnpackCompute; + calls_[DT_FLOAT16][DT_UINT8] = LuUnpackCompute; + + calls_[DT_FLOAT][DT_INT8] = LuUnpackCompute; + calls_[DT_FLOAT][DT_INT16] = LuUnpackCompute; + calls_[DT_FLOAT][DT_INT32] = LuUnpackCompute; + calls_[DT_FLOAT][DT_INT64] = LuUnpackCompute; + calls_[DT_FLOAT][DT_UINT8] = LuUnpackCompute; + + calls_[DT_DOUBLE][DT_INT8] = LuUnpackCompute; + calls_[DT_DOUBLE][DT_INT16] = LuUnpackCompute; + calls_[DT_DOUBLE][DT_INT32] = LuUnpackCompute; + calls_[DT_DOUBLE][DT_INT64] = LuUnpackCompute; + calls_[DT_DOUBLE][DT_UINT8] = LuUnpackCompute; + + calls_[DT_UINT8][DT_INT8] = LuUnpackCompute; + calls_[DT_UINT8][DT_INT16] = LuUnpackCompute; + calls_[DT_UINT8][DT_INT32] = LuUnpackCompute; + calls_[DT_UINT8][DT_INT64] = LuUnpackCompute; + calls_[DT_UINT8][DT_UINT8] = LuUnpackCompute; +} + +uint32_t LuUnpackCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "LuUnpack check input and output number failed."); + Tensor *LU_data_ = ctx.Input(0); + Tensor *LU_pivots_ = ctx.Input(1); + std::shared_ptr LU_data_shape = LU_data_->GetTensorShape(); + std::shared_ptr LU_pivots_shape = LU_pivots_->GetTensorShape(); + int32_t LU_data_rank = LU_data_shape->GetDims(); + if (LU_data_rank < kLuDataMinRank) { + KERNEL_LOG_ERROR( + "The input dim size of LU_data must be at least 2-D, " + "while %d", + LU_data_rank); + return KERNEL_STATUS_PARAM_INVALID; + } + int32_t Lu_data_dims = LU_data_shape->GetDims(); + int64_t Lu_data_dim1 = LU_data_shape->GetDimSize(Lu_data_dims - 2); + int64_t Lu_data_dim2 = LU_data_shape->GetDimSize(Lu_data_dims - 1); + int32_t Lu_pivots_dims = LU_pivots_shape->GetDims(); + int64_t Lu_pivots_dim = LU_pivots_shape->GetDimSize(Lu_pivots_dims - 1); + if (Lu_pivots_dim != std::min(Lu_data_dim1, Lu_data_dim2)) { + KERNEL_LOG_ERROR( + "The last dimension of LU_pivots must be the same as the minimum value " + "of the last two dimensions of LU_data, " + "but got The last dimension of LU_pivots [%d], the minimum value of " + "the last 
two dimensions of LU_data: [%d]", + Lu_pivots_dim, std::min(Lu_data_dim1, Lu_data_dim2)); + return KERNEL_STATUS_PARAM_INVALID; + } + for (int32_t i = 0; i < Lu_pivots_dims - 1; i++) { + if (LU_data_shape->GetDimSize(i) != LU_pivots_shape->GetDimSize(i)) { + KERNEL_LOG_ERROR( + " LU_data's batch dimensions does not match LU_pivots's batch " + "dimensions."); + return KERNEL_STATUS_PARAM_INVALID; + } + } + DataType LU_data_dtype = static_cast(LU_data_->GetDataType()); + bool LU_data_dtype_flag = LU_data_dtype != DT_FLOAT16 && LU_data_dtype != DT_FLOAT && LU_data_dtype != DT_DOUBLE && + LU_data_dtype != DT_INT8 && LU_data_dtype != DT_UINT8 && LU_data_dtype != DT_INT16 && + LU_data_dtype != DT_INT32 && LU_data_dtype != DT_INT64; + if (LU_data_dtype_flag) { + KERNEL_LOG_ERROR( + "Op LuUnpack first input LU_data_type's data type should be of the " + "follows: " + "DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT16, " + "DT_FLOAT, DT_DOUBLE, " + "but this type is [%s].", + DTypeStr(LU_data_dtype).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + DataType LU_pivots_dtype = static_cast(LU_pivots_->GetDataType()); + bool LU_pivots_dtype_flag = LU_pivots_dtype != DT_INT8 && LU_pivots_dtype != DT_UINT8 && + LU_pivots_dtype != DT_INT16 && LU_pivots_dtype != DT_INT32 && LU_pivots_dtype != DT_INT64; + if (LU_pivots_dtype_flag) { + KERNEL_LOG_ERROR( + "Op LuUnpack second input LU_pivots_type's data type should be of the " + "follows: " + "DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, " + "but this type is [%s].", + DTypeStr(LU_pivots_dtype).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + SetMap(); + std::vector LU_data_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32, + DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}; + std::vector LU_pivots_type_vec = {DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64}; + for (uint64_t i = 0; i < LU_data_type_vec.size(); i++) { + for (uint64_t j = 0; j < LU_pivots_type_vec.size(); j++) { + if (LU_data_dtype == LU_data_type_vec[i] && LU_pivots_dtype == LU_pivots_type_vec[j]) { + KERNEL_HANDLE_ERROR(calls_[LU_data_type_vec[i]][LU_pivots_type_vec[j]](ctx), + "The elements of LU_pivots must be greater than 1 " + "and be less than the size of LU_pivots's last dimension."); + } + } + } + calls_.clear(); + return KERNEL_STATUS_OK; +} +REGISTER_CPU_KERNEL(kLuUnpack, LuUnpackCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack.h new file mode 100644 index 00000000000..71a5071dd2c --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack.h @@ -0,0 +1,40 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef AICPU_KERNELS_NORMALIZED_LUUNPACK_H_ +#define AICPU_KERNELS_NORMALIZED_LUUNPACK_H_ + +#include "cpu_ops_kernel.h" +#include "utils/bcast.h" +namespace aicpu { +class LuUnpackCpuKernel : public CpuKernel { + public: + LuUnpackCpuKernel() = default; + ~LuUnpackCpuKernel() = default; + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + static uint32_t LuUnpack(CpuKernelContext &ctx, T_pivots *Lu_pivots_working_ptr, int64_t matrix_index, T_data *P_eye); + template + static uint32_t LuUnpackCompute(CpuKernelContext &ctx); + template + static uint32_t DataAndTypeCheck(CpuKernelContext &ctx); + std::map>> calls_; + void SetMap(); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack_grad.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack_grad.cc new file mode 100644 index 00000000000..fdde631167e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack_grad.cc @@ -0,0 +1,183 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "lu_unpack_grad.h" +#include +#include "Eigen/Core" +#include "cpu_kernel_utils.h" +#include "cpu_types.h" +#include "kernel_log.h" +#include "securec.h" +#include "status.h" +#include "utils/broadcast_iterator.h" +#include "utils/kernel_util.h" + +namespace { +const char *kLuUnpackGrad = "LuUnpackGrad"; +const int64_t kParallelBatchNum = 30; +const uint32_t kInputNum = 3; +const uint32_t kOutputNum = 2; +const uint32_t kInputFirst = 0; +const uint32_t kInputSecond = 1; +const uint32_t kInputThird = 2; +} // namespace + +namespace aicpu { +uint32_t LuUnpackGradCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Lu Unpack Grad check input and output number failed."); + // choose compute function depend on dataType + auto input_type = static_cast(ctx.Input(kInputThird)->GetDataType()); + switch (input_type) { + case DT_FLOAT16: + return LuUnpackGradCompute(ctx); + case DT_FLOAT: + return LuUnpackGradCompute(ctx); + case DT_DOUBLE: + return LuUnpackGradCompute(ctx); + case DT_INT8: + return LuUnpackGradCompute(ctx); + case DT_INT16: + return LuUnpackGradCompute(ctx); + case DT_INT32: + return LuUnpackGradCompute(ctx); + case DT_INT64: + return LuUnpackGradCompute(ctx); + case DT_UINT8: + return LuUnpackGradCompute(ctx); + default: + KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(), + DTypeStr(input_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t LuUnpackGradCpuKernel::TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a) { + Tensor *L_grad = NULL; + Tensor *U_grad = NULL; + Tensor *LU_data = NULL; + L_grad = ctx.Input(kInputFirst); + U_grad = 
ctx.Input(kInputSecond); + LU_data = ctx.Input(kInputThird); + auto LU_data_shape = LU_data->GetTensorShape(); + int32_t LU_data_dims = LU_data_shape->GetDims(); + int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2); + int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1); + auto LU_dim_min = std::min(LU_data_height, LU_data_width); + auto input_U_shape = U_grad->GetTensorShape(); + auto input_U_dim_size = input_U_shape->GetDimSizes(); + auto input_U_dims = input_U_shape->GetDims(); + int64_t matrix_U_width = input_U_dim_size[input_U_dims - 2]; + int64_t matrix_U_height = input_U_dim_size[input_U_dims - 1]; + int64_t matrix_U_size = matrix_U_width * matrix_U_height; + auto input_L_shape = L_grad->GetTensorShape(); + auto input_L_dim_size = input_L_shape->GetDimSizes(); + using MatrixMap = Eigen::Map>; + auto input_L_dims = input_L_shape->GetDims(); + int64_t matrix_L_width = input_L_dim_size[input_L_dims - 2]; + int64_t matrix_L_height = input_L_dim_size[input_L_dims - 1]; + int64_t matrix_L_size = matrix_L_width * matrix_L_height; + int64_t output_stride = LU_data_height * LU_data_width; + + MatrixMap input_L(reinterpret_cast(L_grad->GetData()) + a * matrix_L_size, matrix_L_width, matrix_L_height); + MatrixMap input_U(reinterpret_cast(U_grad->GetData()) + a * matrix_U_size, matrix_U_width, matrix_U_height); + if (LU_data_width > LU_data_height) { + MatrixMap output_L(reinterpret_cast(L_grad_output->GetData()) + a * output_stride, LU_data_height, + LU_data_width); + T *MiddlePtr = new T[matrix_L_size]; + MatrixMap MiddleData(MiddlePtr, matrix_L_width, matrix_L_height); + MiddleData = input_L.template triangularView(); + for (auto i = 0; i < LU_data_height; i++) { + for (auto j = 0; j < LU_dim_min; j++) { + output_L(i, j) = MiddleData(i, j); + } + } + delete[] MiddlePtr; + } else { + MatrixMap output_L(reinterpret_cast(L_grad_output->GetData()) + a * output_stride, LU_data_height, + LU_data_width); + output_L = input_L.template triangularView(); + } + if (LU_data_height > LU_data_width) { + MatrixMap output_U(reinterpret_cast(U_grad_output->GetData()) + a * output_stride, LU_data_height, + LU_data_width); + T *MiddlePtr = new T[matrix_U_size]; + MatrixMap MiddleData(MiddlePtr, matrix_U_width, matrix_U_height); + MiddleData = input_U.template triangularView(); + for (auto i = 0; i < LU_dim_min; i++) { + for (auto j = i; j < LU_data_width; j++) { + output_U(i, j) = MiddleData(i, j); + } + } + delete[] MiddlePtr; + } else { + MatrixMap output_U(reinterpret_cast(U_grad_output->GetData()) + a * output_stride, LU_data_height, + LU_data_width); + output_U = input_U.template triangularView(); + } + return KERNEL_STATUS_OK; +} + +template +uint32_t LuUnpackGradCpuKernel::LuUnpackGradCompute(CpuKernelContext &ctx) { + Tensor *LU_data = NULL; + Tensor *L_grad_output = NULL; + Tensor *U_grad_output = NULL; + LU_data = ctx.Input(kInputThird); + L_grad_output = ctx.Output(0); + U_grad_output = ctx.Output(1); + + auto LU_data_shape = LU_data->GetTensorShape(); + int32_t LU_data_dims = LU_data_shape->GetDims(); + int64_t LU_data_elem_num = LU_data->NumElements(); + + int64_t LU_data_height = LU_data_shape->GetDimSize(LU_data_dims - 2); + int64_t LU_data_width = LU_data_shape->GetDimSize(LU_data_dims - 1); + int64_t LU_data_stride = LU_data_height * LU_data_width; + int64_t matrix_num = LU_data_elem_num / LU_data_stride; + + auto L_grad_output_data = reinterpret_cast(L_grad_output->GetData()); + auto U_grad_output_data = reinterpret_cast(U_grad_output->GetData()); + for (auto 
i = 0; i < LU_data_elem_num; i++) { + *(L_grad_output_data + i) = static_cast(0); + *(U_grad_output_data + i) = static_cast(0); + } + if (matrix_num < kParallelBatchNum) { + for (auto i = 0; i < matrix_num; i++) { + TriLU(ctx, L_grad_output, U_grad_output, i); + } + } else { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx)); + if (max_core_num > matrix_num) { + max_core_num = matrix_num; + } + auto sharder = [&](int64_t start, int64_t end) { + for (int64_t i = start; i < end; i++) { + TriLU(ctx, L_grad_output, U_grad_output, i); + } + }; + if (max_core_num == 0) { + KERNEL_LOG_ERROR("max_core_num could not be 0."); + } + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, sharder), + "LuUnpackGrad Compute failed."); + } + + return KERNEL_STATUS_OK; +} +REGISTER_CPU_KERNEL(kLuUnpackGrad, LuUnpackGradCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack_grad.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack_grad.h new file mode 100644 index 00000000000..c3cc1757ac7 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack_grad.h @@ -0,0 +1,40 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AICPU_KERNELS_LU_UNPACK_GRAD_H_ +#define AICPU_KERNELS_LU_UNPACK_GRAD_H_ + +#include "cpu_ops_kernel.h" + +namespace aicpu { +class LuUnpackGradCpuKernel : public CpuKernel { + public: + ~LuUnpackGradCpuKernel() = default; + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + /** + * @brief compute for all types + * @param ctx cpu kernel context + * @return status if success + */ + template + uint32_t LuUnpackGradCompute(CpuKernelContext &ctx); + + template + uint32_t TriLU(CpuKernelContext &ctx, Tensor *L_grad_output, Tensor *U_grad_output, int64_t a); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matmul.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matmul.cc new file mode 100644 index 00000000000..a8dcb6ec6b5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matmul.cc @@ -0,0 +1,179 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "matmul.h" + +#include +#include "unsupported/Eigen/CXX11/Tensor" + +#include "utils/kernel_util.h" +#include "cpu_kernel_utils.h" +#include "kernel_log.h" +#include "status.h" + +using namespace std; + +namespace { +const char *kMatmul = "MatMul"; +} // namespace + +namespace aicpu { +template +uint32_t MatMulCpuKernel::AddCompute(CpuKernelContext &ctx, Bcast &bcast) { + auto in2 = reinterpret_cast(ctx.Input(2)->GetData()); + auto out = reinterpret_cast(ctx.Output(0)->GetData()); + int64_t data_num = ctx.Output(0)->NumElements(); + + for (int64_t i = 0; i < data_num; i++) { + auto input1 = in2 + bcast.GetBroadcastXIndex(i); // i-th value of input0 + auto input2 = out + bcast.GetBroadcastYIndex(i); // i-th value of input1 + *(out + i) = (*input1) + (*input2); + } + return KERNEL_STATUS_OK; +} + +template +uint32_t MatMulCpuKernel::BiasCompute(CpuKernelContext &ctx) { + auto input0_tensor = ctx.Input(0); + auto input2_tensor = ctx.Input(2); + auto input2_shape = input2_tensor->GetTensorShape()->GetDimSizes(); + auto output_tensor = ctx.Output(kFirstOutputIndex); + auto output_shape = output_tensor->GetTensorShape()->GetDimSizes(); + + KERNEL_CHECK_FALSE(input2_tensor->GetTensorShape()->GetDims() == 1, KERNEL_STATUS_PARAM_INVALID, + "Input[x3] must be a 1D tensor") + + DataType input0_data_type = input0_tensor->GetDataType(); + DataType input2_data_type = input2_tensor->GetDataType(); + KERNEL_CHECK_FALSE((input0_data_type == input2_data_type), KERNEL_STATUS_PARAM_INVALID, + "Input[x1] data type[%s] and input[x3] data type[%s] must be same", + DTypeStr(input0_data_type).c_str(), DTypeStr(input2_data_type).c_str()) + + Bcast bcast(input2_shape, output_shape); + if (!bcast.IsValid()) { + KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return AddCompute(ctx, bcast); +} + +template +uint32_t MatMulCpuKernel::MatMulCompute(CpuKernelContext &ctx) { + auto input0_tensor = ctx.Input(0); + auto input0_tensor_shape = input0_tensor->GetTensorShape(); + KERNEL_CHECK_FALSE((IsMatrix(input0_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID, + "Input[x1] must be a matrix") + + auto input1_tensor = ctx.Input(1); + auto input1_tensor_shape = input1_tensor->GetTensorShape(); + KERNEL_CHECK_FALSE((IsMatrix(input1_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID, + "Input[x2] must be a matrix") + + auto transpose_x1 = ctx.GetAttr("transpose_x1")->GetBool(); + auto transpose_x2 = ctx.GetAttr("transpose_x2")->GetBool(); + KERNEL_LOG_DEBUG( + "%s Attr[transpose_x1] value[%d], " + "Attr[transpose_x2] value[%d].", + kMatmul, transpose_x1, transpose_x2); + int32_t x1_dim = transpose_x1 ? 0 : 1; + int32_t x2_dim = transpose_x2 ? 
1 : 0; + KERNEL_CHECK_FALSE((input0_tensor_shape->GetDimSize(x1_dim) == input1_tensor_shape->GetDimSize(x2_dim)), + KERNEL_STATUS_PARAM_INVALID, + "Matrix size incompatible, input[x1] dim[%d] value[%lld], " + "input[x2] dim[%d] value[%lld]", + x1_dim, input0_tensor_shape->GetDimSize(x1_dim), x2_dim, input1_tensor_shape->GetDimSize(x2_dim)) + + auto input0_shape = input0_tensor_shape->GetDimSizes(); + using MatrixMap = Eigen::Map>; + MatrixMap input0(reinterpret_cast(input0_tensor->GetData()), input0_shape[0], input0_shape[1]); + + auto input1_shape = input1_tensor_shape->GetDimSizes(); + MatrixMap input1(reinterpret_cast(input1_tensor->GetData()), input1_shape[0], input1_shape[1]); + + auto output_tensor = ctx.Output(kFirstOutputIndex); + auto output_shape = output_tensor->GetTensorShape()->GetDimSizes(); + MatrixMap output(reinterpret_cast(output_tensor->GetData()), output_shape[0], output_shape[1]); + if (transpose_x1) { + if (transpose_x2) { + output = input0.transpose() * input1.transpose(); + } else { + output = input0.transpose() * input1; + } + } else { + if (transpose_x2) { + output = input0 * input1.transpose(); + } else { + output = input0 * input1; + } + } + if (ctx.GetInputsSize() == 3) { + return BiasCompute(ctx); + } + return KERNEL_STATUS_OK; +} + +uint32_t MatMulCpuKernel::Compute(CpuKernelContext &ctx) { + // check params + uint32_t input_num = ctx.GetInputsSize(); + uint32_t output_num = ctx.GetOutputsSize(); + if ((input_num != 2 && input_num != 3) || output_num != 1) { + KERNEL_LOG_ERROR("The number of input or output parameters does not match."); + return KERNEL_STATUS_PARAM_INVALID; + } + auto input0_tensor = ctx.Input(0); + KERNEL_CHECK_NULLPTR(input0_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x1] data failed", + ctx.GetOpType().c_str()) + + auto input1_tensor = ctx.Input(1); + auto input1_tensor_shape = input1_tensor->GetTensorShape(); + KERNEL_CHECK_NULLPTR(input1_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get input[x2] data failed", + ctx.GetOpType().c_str()) + + DataType input0_data_type = input0_tensor->GetDataType(); + DataType input1_data_type = input1_tensor->GetDataType(); + KERNEL_CHECK_FALSE((input0_data_type == input1_data_type), KERNEL_STATUS_PARAM_INVALID, + "Input[x1] data type[%s] and input[x2] data type[%s] must be same", + DTypeStr(input0_data_type).c_str(), DTypeStr(input1_data_type).c_str()) + KERNEL_LOG_DEBUG("%s op input[x1] data type is [%s].", kMatmul, DTypeStr(input0_data_type).c_str()); + uint32_t ret = KERNEL_STATUS_OK; + switch (input0_data_type) { + case DT_FLOAT: + ret = MatMulCompute(ctx); + break; + case DT_DOUBLE: + ret = MatMulCompute(ctx); + break; + case DT_FLOAT16: + ret = MatMulCompute(ctx); + break; + case DT_INT32: + ret = MatMulCompute(ctx); + break; + case DT_COMPLEX64: + ret = MatMulCompute>(ctx); + break; + case DT_COMPLEX128: + ret = MatMulCompute>(ctx); + break; + default: + KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(), + DTypeStr(input0_data_type).c_str()); + ret = KERNEL_STATUS_PARAM_INVALID; + } + return ret; +} + +REGISTER_CPU_KERNEL(kMatmul, MatMulCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matmul.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matmul.h new file mode 100644 index 00000000000..6a7cb1a826d --- /dev/null +++ 
b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matmul.h @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AICPU_KERNELS_HOST_MATMUL_H_ +#define AICPU_KERNELS_HOST_MATMUL_H_ + +#include "cpu_ops_kernel.h" +#include "utils/bcast.h" + +namespace aicpu { +class MatMulCpuKernel : public CpuKernel { + public: + MatMulCpuKernel() = default; + ~MatMulCpuKernel() = default; + + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + uint32_t AddCompute(CpuKernelContext &ctx, Bcast &bcast); + template + uint32_t BiasCompute(CpuKernelContext &ctx); + template + uint32_t MatMulCompute(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_exp.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_exp.cc new file mode 100644 index 00000000000..195c3ee4f67 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_exp.cc @@ -0,0 +1,320 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "matrix_exp.h" + +#include +#include +#include +#include "cpu_kernel_utils.h" +#include "utils/kernel_util.h" + +namespace { +constexpr uint32_t kMatrixExpInputNum = 1; +constexpr uint32_t kMatrixExpOutputNum = 1; +constexpr uint32_t kIndexTwo = 2; +const int64_t paralled_data_size = 8 * 1024; +const char *kMatrixExp = "MatrixExp"; +constexpr int total_n_degs = 6; + +// Coefficients for computing taylor approximant of order 8. +constexpr double sqrt_177 = 0.1330413469565007072504e+2, x3 = 2. / 3.; +constexpr double x1 = x3 * ((1. + sqrt_177) / 88.), x2 = x3 * ((1. + sqrt_177) / 352.); +constexpr double x4 = (-271. + 29. * sqrt_177) / (315. * x3), x5 = (-11. + 11. * sqrt_177) / (1260. * x3); +constexpr double x6 = (-99. + 11. * sqrt_177) / (5040. * x3), x7 = (89. - sqrt_177) / (5040. * x3); +constexpr double y2 = (857. - 58. * sqrt_177) / 630.; + +template +using array2d = std::array, ROW>; + +// Coefficients for computing taylor approximant of order 12. 
+constexpr int num_prods_12 = 4; +array2d b12 = { + {{9.0198e-16, 0.46932117595418237389, -0.20099424927047284052, -0.04623946134063071740}, + {5.31597895759871264183, 1.19926790417132231573, 0.01179296240992997031, 0.01108844528519167989}, + {0.18188869982170434744, 0.05502798439925399070, 0.09351590770535414968, 0.00610700528898058230}, + {-2.0861320e-13, -0.13181061013830184015, -0.02027855540589259079, -0.00675951846863086359}}}; + +// Coefficients for computing taylor approximant of order 18. +constexpr int num_prods_18 = 5; +array2d b18 = { + {{0., -1.00365581030144618291e-01, -8.02924648241156932449e-03, -8.92138498045729985177e-04, 0.}, + {0., 3.97849749499645077844e-01, 1.36783778460411720168e+00, 4.98289622525382669416e-01, + -6.37898194594723280150e-04}, + {-1.09676396052962061844e+01, 1.68015813878906206114e+00, 5.71779846478865511061e-02, -6.98210122488052056106e-03, + 3.34975017086070470649e-05}, + {-9.04316832390810593223e-02, -6.76404519071381882256e-02, 6.75961301770459654925e-02, 2.95552570429315521194e-02, + -1.39180257516060693404e-05}, + {0., 0., -9.23364619367118555360e-02, -1.69364939002081722752e-02, -1.40086798182036094347e-05}}}; + +// Threshold for different order of taylor approximant. +constexpr std::array thetas_float = {1.192092800768788e-07, 5.978858893805233e-04, + 5.116619363445086e-02, 5.800524627688768e-01, + 1.461661507209034e+00, 3.010066362817634e+00}; + +// Threshold for different order of taylor approximant. +constexpr std::array thetas_double = {2.220446049250313e-16, 2.580956802971767e-08, + 3.397168839976962e-04, 4.991228871115323e-02, + 2.996158913811580e-01, 1.090863719290036e+00}; + +#define MATRIX_EXP_COMPUTE_CASE(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = MatrixExpCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \ + return result; \ + } \ + break; \ + } + +#define MATRIX_EXP_COMPUTE_DIFF_CASE(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = MatrixExpDiffTypeCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("MatrixExp kernel compute failed."); \ + return result; \ + } \ + break; \ + } +} // namespace + +namespace aicpu { +uint32_t MatrixExpCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMatrixExpInputNum, kMatrixExpOutputNum), + "[%s] check input and output number failed.", kMatrixExp); + KERNEL_HANDLE_ERROR(MatrixExpCheck(ctx), "[%s] check params failed.", kMatrixExp); + auto data_type = ctx.Input(0)->GetDataType(); + switch (data_type) { + MATRIX_EXP_COMPUTE_CASE(DT_FLOAT, float, ctx) + MATRIX_EXP_COMPUTE_CASE(DT_DOUBLE, double, ctx) + MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX64, std::complex, ctx) + MATRIX_EXP_COMPUTE_CASE(DT_COMPLEX128, std::complex, ctx) + MATRIX_EXP_COMPUTE_DIFF_CASE(DT_FLOAT16, Eigen::half, ctx) + default: + KERNEL_LOG_ERROR("MatrixExp kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +uint32_t MatrixExpCpuKernel::MatrixExpCheck(CpuKernelContext &ctx) { + auto input_0 = ctx.Input(0); + std::vector shape_x = input_0->GetTensorShape()->GetDimSizes(); + size_t shape_size_x = shape_x.size(); + KERNEL_CHECK_FALSE((shape_size_x > 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 2, got [%zu].", + shape_size_x) + KERNEL_CHECK_FALSE((shape_x[shape_size_x - 1] > 0), KERNEL_STATUS_PARAM_INVALID, + "Input x's last dimension must be at least 1.") + KERNEL_CHECK_FALSE((shape_x[shape_size_x 
- kIndexTwo] == shape_x[shape_size_x - 1]), KERNEL_STATUS_PARAM_INVALID, + "Input x's last two dimensions must be equal, but are [%lld] and [%lld].", + shape_x[shape_size_x - kIndexTwo], shape_x[shape_size_x - 1]) + return KERNEL_STATUS_OK; +} + +template +void MatrixExpCpuKernel::MTaylorApproximant(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, + int order, Eigen::MatrixBase &E) { + constexpr int expension_order_1 = 1; + constexpr int expension_order_2 = 2; + constexpr int expension_order_4 = 4; + constexpr int expension_order_8 = 8; + constexpr int expension_order_12 = 12; + auto A2 = A * A; + auto A3 = A * A2; + if (order == expension_order_1) { + E = I + A; + } else if (order == expension_order_2) { + constexpr int A2_divisor = 2; + E = I + A + A2 / A2_divisor; + } else if (order == expension_order_4) { + constexpr int I_divisor = 2; + constexpr int A_divisor = 6; + constexpr int A2_divisor = 24; + E = I + A + A2 * (I / I_divisor + A / A_divisor + A2 / A2_divisor); + } else if (order == expension_order_8) { + auto A4 = A2 * (x1 * A + x2 * A2); + auto A8 = (x3 * A2 + A4) * (x4 * I + x5 * A + x6 * A2 + x7 * A4); + E = I + A + y2 * A2 + A8; + } else if (order == expension_order_12) { + auto q31 = b12[0][0] * I + b12[0][1] * A + b12[0][2] * A2 + b12[0][3] * A3; + auto q32 = b12[1][0] * I + b12[1][1] * A + b12[1][2] * A2 + b12[1][3] * A3; + auto q33 = b12[2][0] * I + b12[2][1] * A + b12[2][2] * A2 + b12[2][3] * A3; + auto q34 = b12[3][0] * I + b12[3][1] * A + b12[3][2] * A2 + b12[3][3] * A3; + auto q61 = q33 + q34 * q34; + E = q31 + (q32 + q61) * q61; + } else { + auto A6 = A3 * A3; + auto q31 = b18[0][0] * I + b18[0][1] * A + b18[0][2] * A2 + b18[0][3] * A3 + b18[0][4] * A6; + auto q61 = b18[1][0] * I + b18[1][1] * A + b18[1][2] * A2 + b18[1][3] * A3 + b18[1][4] * A6; + auto q62 = b18[2][0] * I + b18[2][1] * A + b18[2][2] * A2 + b18[2][3] * A3 + b18[2][4] * A6; + auto q63 = b18[3][0] * I + b18[3][1] * A + b18[3][2] * A2 + b18[3][3] * A3 + b18[3][4] * A6; + auto q64 = b18[4][0] * I + b18[4][1] * A + b18[4][2] * A2 + b18[4][3] * A3 + b18[4][4] * A6; + auto q91 = q31 * q64 + q63; + E = q61 + (q62 + q91) * q91; + } +} + +template +void MatrixExpCpuKernel::MexpImpl(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, + Eigen::MatrixBase &mexp, CpuKernelContext &ctx) { + const auto norm = A.cwiseAbs().colwise().sum().maxCoeff(); + constexpr std::array m_vals = {1, 2, 4, 8, 12, 18}; + constexpr int cut_deg = 2; + int64_t s = -1; + auto data_type = ctx.Input(0)->GetDataType(); + if (data_type == DT_FLOAT16 || data_type == DT_FLOAT || data_type == DT_COMPLEX64) { + for (int i = 0; i < total_n_degs - 1; i++) { + if (norm <= thetas_float[i]) { + MTaylorApproximant(A, I, m_vals[i], mexp); + break; + } + } + if (norm >= thetas_float[total_n_degs - cut_deg]) { + s = ceil(log2(norm / thetas_float[total_n_degs - 1])); + if (s <= 0) { + s = 0; + } + } + } else { + for (int i = 0; i < total_n_degs - 1; i++) { + if (norm <= thetas_double[i]) { + MTaylorApproximant(A, I, m_vals[i], mexp); + break; + } + } + if (norm >= thetas_double[total_n_degs - cut_deg]) { + s = ceil(log2(norm / thetas_double[total_n_degs - 1])); + if (s <= 0) { + s = 0; + } + } + } + if (s >= 0) { + const auto pow2s = pow(2, s); + const auto A_scaled = A / pow2s; + MTaylorApproximant(A_scaled, I, m_vals[total_n_degs - 1], mexp); + for (int k = 0; k < s; k++) { + mexp = mexp * mexp; + } + } +} + +template +uint32_t MatrixExpCpuKernel::MatrixExpCompute(CpuKernelContext &ctx) { + auto input_x = 
reinterpret_cast(ctx.Input(0)->GetData()); + auto output_y = reinterpret_cast(ctx.Output(0)->GetData()); + + std::vector shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes(); + size_t shape_size = shape_x.size(); + int64_t m = shape_x[shape_size - 1]; + int64_t size_mm = m * m; + typedef Eigen::Matrix MatrixXd; + MatrixXd I(m, m); + I.setIdentity(); + int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm; + int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T); + if (data_size <= paralled_data_size) { + for (int64_t i = 0; i < matrix_num; i++) { + Eigen::Map matrix_x(input_x + i * m * m, m, m); + Eigen::Map matrix_y(output_y + i * m * m, m, m); + if (matrix_x.size() > 0) { + MexpImpl(matrix_x, I, matrix_y, ctx); + } + } + } else { + uint32_t min_core_num = 1; + int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + if (max_core_num == 0) { + return KERNEL_STATUS_PARAM_INVALID; + } + if (max_core_num > matrix_num) { + max_core_num = matrix_num; + } + auto shard_work = [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + Eigen::Map matrix_x(input_x + i * m * m, m, m); + Eigen::Map matrix_y(output_y + i * m * m, m, m); + if (matrix_x.size() > 0) { + MexpImpl(matrix_x, I, matrix_y, ctx); + } + } + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work), + "MatrixExp Compute failed."); + } + return KERNEL_STATUS_OK; +} + +void MatrixExpCpuKernel::TyepChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y, + CpuKernelContext &ctx) { + typedef Eigen::Matrix MatrixXd; + MatrixXd I(m, m); + (void)I.setIdentity(); + MatrixXd matrix_x(m, m); + MatrixXd matrix_y(m, m); + int64_t size_mm = m * m; + for (int p = 0; p < m; p++) { + for (int q = 0; q < m; q++) { + matrix_x(p, q) = static_cast(input_x[i * size_mm + p * m + q]); + } + } + if (matrix_x.size() > 0) { + MexpImpl(matrix_x, I, matrix_y, ctx); + } + for (int p = 0; p < m; p++) { + for (int q = 0; q < m; q++) { + output_y[i * size_mm + p * m + q] = static_cast(matrix_y(p, q)); + } + } +} + +template +uint32_t MatrixExpCpuKernel::MatrixExpDiffTypeCompute(CpuKernelContext &ctx) { + T *input_x = reinterpret_cast(ctx.Input(0)->GetData()); + auto output_y = reinterpret_cast(ctx.Output(0)->GetData()); + + std::vector shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes(); + size_t shape_size = shape_x.size(); + int64_t m = shape_x[shape_size - 1]; + int64_t size_mm = m * m; + int64_t matrix_num = ctx.Input(0)->NumElements() / size_mm; + int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T); + if (data_size <= paralled_data_size) { + for (int64_t i = 0; i < matrix_num; i++) { + TyepChangeForFp16(i, m, input_x, output_y, ctx); + } + } else { + uint32_t min_core_num = 1; + int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + if (max_core_num == 0) { + return KERNEL_STATUS_PARAM_INVALID; + } + if (max_core_num > matrix_num) { + max_core_num = matrix_num; + } + auto shard_work = [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + TyepChangeForFp16(i, m, input_x, output_y, ctx); + } + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_work), + "MatrixExp Compute failed."); + } + // } + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kMatrixExp, MatrixExpCpuKernel); +} // namespace aicpu diff --git 
a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_exp.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_exp.h new file mode 100644 index 00000000000..cddf4440f18 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_exp.h @@ -0,0 +1,50 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_ +#define AICPU_KERNELS_NORMALIZED_MATRIX_EXP_H_ + +#include "cpu_ops_kernel.h" +#include "utils/eigen_tensor.h" +namespace aicpu { +class MatrixExpCpuKernel : public CpuKernel { + public: + MatrixExpCpuKernel() = default; + ~MatrixExpCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + uint32_t MatrixExpCheck(CpuKernelContext &ctx); + + template + void MTaylorApproximant(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, int order, + Eigen::MatrixBase &E); + + template + void MexpImpl(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, + Eigen::MatrixBase &mexp, CpuKernelContext &ctx); + + template + uint32_t MatrixExpCompute(CpuKernelContext &ctx); + + void TyepChangeForFp16(int64_t i, int64_t m, Eigen::half *input_x, Eigen::half *output_y, CpuKernelContext &ctx); + + template + uint32_t MatrixExpDiffTypeCompute(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc new file mode 100644 index 00000000000..7c11d4a02df --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc @@ -0,0 +1,460 @@ +/** + * Copyright(c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unminimum required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "maximum.h" + +#include "Eigen/Dense" +#include "cmath" +#include "cpu_kernel_utils.h" +#include "iostream" +#include "unsupported/Eigen/CXX11/Tensor" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" +namespace { +const uint32_t kInputNum = 2; +const uint32_t kOutputNum = 1; +const char *kMaximum = "Maximum"; +// when input data size is more than kParallelDataNum, use Parallel func +const int64_t kParallelDataNum = 2 * 1024; +const int64_t kParallelDataNumMid = 16 * 1024; +const int64_t kParallelDataNumSameShape = 7 * 1024; +const int64_t kParallelDataNumSameShapeMid = 35 * 1024; + +#define MAXIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = MaximumCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("Maximum kernel compute failed."); \ + return result; \ + } \ + break; \ + } +} // namespace + +namespace aicpu { +uint32_t MaximumCpuKernel::Compute(CpuKernelContext &ctx) { + // check params + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Maximum check input and output number failed."); + KERNEL_HANDLE_ERROR(MaximumParamCheck(ctx), "Maximum check params failed."); + auto data_type = ctx.Input(0)->GetDataType(); + switch (data_type) { + MAXIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx) + MAXIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx) + MAXIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx) + MAXIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx) + MAXIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx) + default: + KERNEL_LOG_ERROR("Maximum kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + return KERNEL_STATUS_OK; +} + +uint32_t MaximumCpuKernel::MaximumParamCheck(CpuKernelContext &ctx) { + // the non null of input_0, input_1, output has been verified in NormalCheck + Tensor *input_0 = ctx.Input(0); + Tensor *input_1 = ctx.Input(1); + Tensor *output = ctx.Output(0); + DataType input0_type = input_0->GetDataType(); + DataType input1_type = input_1->GetDataType(); + KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID, + "The data type of input0 [%s] need be same with " + "input1 [%s].", + DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str()) + KERNEL_LOG_DEBUG( + "MaximumCpuKernel[%s], input0: size[%llu];" + "input1: size[%llu], output: size[%llu].", + ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize()); + return KERNEL_STATUS_OK; +} + +template +void MaximumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) { + auto input1 = reinterpret_cast(ctx.Input(0)->GetData()); + auto input2 = reinterpret_cast(ctx.Input(1)->GetData()); + auto output = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + for (int64_t i = start; i < end; ++i) { + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*(input1 + i))) { + *(output + i) = *(input2 + i); + } else if (Eigen::numext::isnan(*(input2 + i))) { + *(output + i) = *(input1 + i); + } else { + *(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i); + } + } + if (ignore_nan == true && is_float16 == false) { + if (isnan(*(input1 + i))) { + *(output + i) = *(input2 + i); + } else if (isnan(*(input2 + i))) { + *(output + i) = *(input1 + i); + } else { + *(output + i) = *(input1 + i) > *(input2 + i) ? 
*(input1 + i) : *(input2 + i); + } + } + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*(input1 + i))) { + *(output + i) = *(input1 + i); + } else if (Eigen::numext::isnan(*(input2 + i))) { + *(output + i) = *(input2 + i); + } else { + *(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i); + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*(input1 + i))) { + *(output + i) = *(input1 + i); + } else if (isnan(*(input2 + i))) { + *(output + i) = *(input2 + i); + } else { + *(output + i) = *(input1 + i) > *(input2 + i) ? *(input1 + i) : *(input2 + i); + } + } + } +} + +template +void MaximumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) { + auto input1 = reinterpret_cast(ctx.Input(0)->GetData()); + auto input2 = reinterpret_cast(ctx.Input(1)->GetData()); + auto output = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + for (int64_t i = start; i < end; ++i) { + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*(input1))) { + *(output + i) = *(input2 + i); + } else if (Eigen::numext::isnan(*(input2 + i))) { + *(output + i) = *(input1); + } else { + *(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i); + } + } + if (ignore_nan == true && is_float16 == false) { + if (isnan(*(input1))) { + *(output + i) = *(input2 + i); + } else if (isnan(*(input2 + i))) { + *(output + i) = *(input1); + } else { + *(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i); + } + } + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*(input1))) { + *(output + i) = *(input1); + } else if (Eigen::numext::isnan(*(input2 + i))) { + *(output + i) = *(input2 + i); + } else { + *(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i); + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*(input1))) { + *(output + i) = *(input1); + } else if (isnan(*(input2 + i))) { + *(output + i) = *(input2 + i); + } else { + *(output + i) = *input1 > *(input2 + i) ? *input1 : *(input2 + i); + } + } + } +} + +template +void MaximumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) { + auto input1 = reinterpret_cast(ctx.Input(0)->GetData()); + auto input2 = reinterpret_cast(ctx.Input(1)->GetData()); + auto output = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + for (int64_t i = start; i < end; ++i) { + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*(input1 + i))) { + *(output + i) = *(input2); + } else if (Eigen::numext::isnan(*(input2))) { + *(output + i) = *(input1 + i); + } else { + *(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2; + } + } + if (ignore_nan == true && is_float16 == false) { + if (isnan(*(input1 + i))) { + *(output + i) = *(input2); + } else if (isnan(*(input2))) { + *(output + i) = *(input1 + i); + } else { + *(output + i) = *(input1 + i) > *input2 ? 
*(input1 + i) : *input2; + } + } + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*(input1 + i))) { + *(output + i) = *(input1 + i); + } else if (Eigen::numext::isnan(*(input2))) { + *(output + i) = *(input2); + } else { + *(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2; + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*(input1 + i))) { + *(output + i) = *(input1 + i); + } else if (isnan(*(input2))) { + *(output + i) = *(input2); + } else { + *(output + i) = *(input1 + i) > *input2 ? *(input1 + i) : *input2; + } + } + } +} + +template +void MaximumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) { + bool is_float16 = false; + if (std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value) { + is_float16 = false; + } else { + is_float16 = true; + } + switch (type) { + case BcastShapeType::SAME_SHAPE: + SpecialComputeSameShape(start, end, ctx, is_float16); + break; + case BcastShapeType::X_ONE_ELEMENT: + SpecialComputeXOneElement(start, end, ctx, is_float16); + break; + case BcastShapeType::Y_ONE_ELEMENT: + SpecialComputeYOneElement(start, end, ctx, is_float16); + break; + default: + KERNEL_LOG_WARN("Invalid type [%d]", static_cast(type)); + break; + } +} + +template +uint32_t MaximumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) { + int64_t in0_elements_nums = ctx.Input(0)->NumElements(); + int64_t in1_elements_nums = ctx.Input(1)->NumElements(); + int64_t data_num = ctx.Output(0)->NumElements(); + BcastShapeType type = in0_elements_nums == in1_elements_nums + ? BcastShapeType::SAME_SHAPE + : (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT); + if (data_num >= kParallelDataNumSameShape) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + + if (data_num <= kParallelDataNumSameShapeMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + + if (max_core_num > data_num) { + max_core_num = data_num; + } + + auto sharder_fmax = [&](int64_t start, int64_t end) { SpecialCompute(type, start, end, ctx); }; + + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax), + "Maximum Compute failed."); + } else { + SpecialCompute(type, 0, data_num, ctx); + } + + return KERNEL_STATUS_OK; +} + +template +void MaximumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, + bool is_float16) { + auto in0 = reinterpret_cast(ctx.Input(0)->GetData()); + auto in1 = reinterpret_cast(ctx.Input(1)->GetData()); + auto out = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + for (int64_t i = start; i < end; ++i) { + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i)) + ? 
*(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == true && is_float16 == false) { + if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + } +} + +template +void MaximumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) { + auto in0 = reinterpret_cast(ctx.Input(0)->GetData()); + auto in1 = reinterpret_cast(ctx.Input(1)->GetData()); + auto out = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + int64_t data_num = ctx.Output(0)->NumElements(); + for (int64_t i = 0; i < data_num; ++i) { + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == true && is_float16 == false) { + if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i)) + ? 
*(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) > *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + } +} + +template +uint32_t MaximumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) { + int64_t data_num = ctx.Output(0)->NumElements(); + bool is_float16 = false; + if (std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value) { + is_float16 = false; + } else { + is_float16 = true; + } + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + + if (data_num <= kParallelDataNumMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + + if (max_core_num > data_num) { + max_core_num = data_num; + } + + auto sharder_fmax = [&](int64_t start, int64_t end) { + BcastComputeMultiKernel(start, end, ctx, bcast, is_float16); + }; + + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_fmax), + "Maximum Compute failed."); + } else { + BcastComputeOneKernel(ctx, bcast, is_float16); + } + return KERNEL_STATUS_OK; +} + +template +uint32_t MaximumCpuKernel::MaximumCompute(CpuKernelContext &ctx) { + Tensor *input0_tensor = ctx.Input(0); + auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes(); + int64_t input0_elements_nums = input0_tensor->NumElements(); + + Tensor *input1_tensor = ctx.Input(1); + auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes(); + int64_t input1_elements_nums = input1_tensor->NumElements(); + + bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1); + if (isNeedBcast) { + return NoBcastCompute(ctx); + } else { + Bcast bcast(input0_shape, input1_shape); + if (!bcast.IsValid()) { + KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + return BcastCompute(ctx, bcast); + } + + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kMaximum, MaximumCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.h new file mode 100644 index 00000000000..f6822901850 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.h @@ -0,0 +1,63 @@ +/** + * Copyright(c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unminimum required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
+#define AICPU_KERNELS_NORMALIZED_MAXIMUM_H_
+
+#include "cpu_ops_kernel.h"
+#include "utils/bcast.h"
+
+namespace aicpu {
+class MaximumCpuKernel : public CpuKernel {
+ public:
+  MaximumCpuKernel() = default;
+  ~MaximumCpuKernel() override = default;
+
+ protected:
+  uint32_t Compute(CpuKernelContext &ctx) override;
+
+ private:
+  uint32_t MaximumParamCheck(CpuKernelContext &ctx);
+
+  template <typename T>
+  void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
+
+  template <typename T>
+  void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
+
+  template <typename T>
+  void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
+
+  template <typename T>
+  void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
+
+  template <typename T>
+  uint32_t NoBcastCompute(CpuKernelContext &ctx);
+
+  template <typename T>
+  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
+
+  template <typename T>
+  void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
+
+  template <typename T>
+  void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
+
+  template <typename T>
+  uint32_t MaximumCompute(CpuKernelContext &ctx);
+};
+}  // namespace aicpu
+#endif
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc
new file mode 100644
index 00000000000..b69e37ffb72
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc
@@ -0,0 +1,456 @@
+/**
+ * Copyright(c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "minimum.h" + +#include "Eigen/Dense" +#include "cmath" +#include "cpu_kernel_utils.h" +#include "iostream" +#include "unsupported/Eigen/CXX11/Tensor" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" +namespace { +const uint32_t kInputNum = 2; +const uint32_t kOutputNum = 1; +const char *kMinimum = "Minimum"; +// when input data size is more than kParallelDataNum, use Parallel func +const int64_t kParallelDataNum = 2 * 1024; +const int64_t kParallelDataNumMid = 16 * 1024; +const int64_t kParallelDataNumSameShape = 7 * 1024; +const int64_t kParallelDataNumSameShapeMid = 35 * 1024; + +#define MINIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = MinimumCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("Minimum kernel compute failed."); \ + return result; \ + } \ + break; \ + } +} // namespace + +namespace aicpu { +uint32_t MinimumCpuKernel::Compute(CpuKernelContext &ctx) { + // check params + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Minimum check input and output number failed."); + KERNEL_HANDLE_ERROR(MinimumParamCheck(ctx), "Minimum check params failed."); + auto data_type = ctx.Input(0)->GetDataType(); + + switch (data_type) { + MINIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx) + MINIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx) + MINIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx) + MINIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx) + MINIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx) + default: + KERNEL_LOG_ERROR("Minimum kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + return KERNEL_STATUS_OK; +} + +uint32_t MinimumCpuKernel::MinimumParamCheck(CpuKernelContext &ctx) { + // the non null of input_0, input_1, output has been verified in NormalCheck + Tensor *input_0 = ctx.Input(0); + Tensor *input_1 = ctx.Input(1); + Tensor *output = ctx.Output(0); + DataType input0_type = input_0->GetDataType(); + DataType input1_type = input_1->GetDataType(); + KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID, + "The data type of input0 [%s] need be same with " + "input1 [%s].", + DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str()) + KERNEL_LOG_DEBUG( + "MinimumCpuKernel[%s], input0: size[%llu];" + "input1: size[%llu], output: size[%llu].", + ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize()); + return KERNEL_STATUS_OK; +} + +template +void MinimumCpuKernel::SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) { + auto input1 = reinterpret_cast(ctx.Input(0)->GetData()); + auto input2 = reinterpret_cast(ctx.Input(1)->GetData()); + auto output = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + for (int64_t i = start; i < end; ++i) { + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*(input1 + i))) { + *(output + i) = *(input1 + i); + } else if (Eigen::numext::isnan(*(input2 + i))) { + *(output + i) = *(input2 + i); + } else { + *(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i); + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*(input1 + i))) { + *(output + i) = *(input1 + i); + } else if (isnan(*(input2 + i))) { + *(output + i) = *(input2 + i); + } else { + *(output + i) = *(input1 + i) < *(input2 + i) ? 
*(input1 + i) : *(input2 + i); + } + } + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*(input1 + i))) { + *(output + i) = *(input2 + i); + } else if (Eigen::numext::isnan(*(input2 + i))) { + *(output + i) = *(input1 + i); + } else { + *(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i); + } + } + if (ignore_nan == true && is_float16 == false) { + if (isnan(*(input1 + i))) { + *(output + i) = *(input2 + i); + } else if (isnan(*(input2 + i))) { + *(output + i) = *(input1 + i); + } else { + *(output + i) = *(input1 + i) < *(input2 + i) ? *(input1 + i) : *(input2 + i); + } + } + } +} +template +void MinimumCpuKernel::SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) { + auto input1 = reinterpret_cast(ctx.Input(0)->GetData()); + auto input2 = reinterpret_cast(ctx.Input(1)->GetData()); + auto output = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + for (int64_t i = start; i < end; ++i) { + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*input1)) { + *(output + i) = *input1; + } else if (Eigen::numext::isnan(*(input2 + i))) { + *(output + i) = *(input2 + i); + } else { + *(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i); + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*input1)) { + *(output + i) = *input1; + } else if (isnan(*(input2 + i))) { + *(output + i) = *(input2 + i); + } else { + *(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i); + } + } + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*input1)) { + *(output + i) = *(input2 + i); + } else if (Eigen::numext::isnan(*(input2 + i))) { + *(output + i) = *input1; + } else { + *(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i); + } + } + if (ignore_nan == true && is_float16 == false) { + if (isnan(*input1)) { + *(output + i) = *(input2 + i); + } else if (isnan(*(input2 + i))) { + *(output + i) = *input1; + } else { + *(output + i) = *input1 < *(input2 + i) ? *input1 : *(input2 + i); + } + } + } +} +template +void MinimumCpuKernel::SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16) { + auto input1 = reinterpret_cast(ctx.Input(0)->GetData()); + auto input2 = reinterpret_cast(ctx.Input(1)->GetData()); + auto output = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + for (int64_t i = start; i < end; ++i) { + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*(input1 + i))) { + *(output + i) = *(input1 + i); + } else if (Eigen::numext::isnan(*input2)) { + *(output + i) = *input2; + } else { + *(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2; + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*(input1 + i))) { + *(output + i) = *(input1 + i); + } else if (isnan(*input2)) { + *(output + i) = *input2; + } else { + *(output + i) = *(input1 + i) < *input2 ? 
*(input1 + i) : *input2; + } + } + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*(input1 + i))) { + *(output + i) = *input2; + } else if (Eigen::numext::isnan(*input2)) { + *(output + i) = *(input1 + i); + } else { + *(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2; + } + } + if (ignore_nan == true && is_float16 == false) { + if (isnan(*(input1 + i))) { + *(output + i) = *input2; + } else if (isnan(*input2)) { + *(output + i) = *(input1 + i); + } else { + *(output + i) = *(input1 + i) < *input2 ? *(input1 + i) : *input2; + } + } + } +} + +template +void MinimumCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) { + bool is_float16 = false; + if (std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value) { + is_float16 = false; + } else { + is_float16 = true; + } + switch (type) { + case BcastShapeType::SAME_SHAPE: + SpecialComputeSameShape(start, end, ctx, is_float16); + break; + case BcastShapeType::X_ONE_ELEMENT: + SpecialComputeXOneElement(start, end, ctx, is_float16); + break; + case BcastShapeType::Y_ONE_ELEMENT: + SpecialComputeYOneElement(start, end, ctx, is_float16); + break; + default: + KERNEL_LOG_WARN("Invalid type [%d]", static_cast(type)); + break; + } +} + +template +uint32_t MinimumCpuKernel::NoBcastCompute(CpuKernelContext &ctx) { + int64_t in0_elements_nums = ctx.Input(0)->NumElements(); + int64_t in1_elements_nums = ctx.Input(1)->NumElements(); + int64_t data_num = ctx.Output(0)->NumElements(); + BcastShapeType type = in0_elements_nums == in1_elements_nums + ? BcastShapeType::SAME_SHAPE + : (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT); + if (data_num >= kParallelDataNumSameShape) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + + if (data_num <= kParallelDataNumSameShapeMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + + if (max_core_num > data_num) { + max_core_num = data_num; + } + + auto sharder_minimum = [&](int64_t start, int64_t end) { SpecialCompute(type, start, end, ctx); }; + + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum), + "Minimum Compute failed."); + } else { + SpecialCompute(type, 0, data_num, ctx); + } + + return KERNEL_STATUS_OK; +} +template +void MinimumCpuKernel::BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, + bool is_float16) { + auto in0 = reinterpret_cast(ctx.Input(0)->GetData()); + auto in1 = reinterpret_cast(ctx.Input(1)->GetData()); + auto out = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + for (int64_t i = start; i < end; ++i) { + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i)) + ? 
*(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + + if (ignore_nan == true && is_float16 == false) { + if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + } +} + +template +void MinimumCpuKernel::BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16) { + auto in0 = reinterpret_cast(ctx.Input(0)->GetData()); + auto in1 = reinterpret_cast(ctx.Input(1)->GetData()); + auto out = reinterpret_cast(ctx.Output(0)->GetData()); + auto ignore_nan = false; + auto ignore_nan_attr = ctx.GetAttr("ignore_nan"); + ignore_nan = (ignore_nan_attr == nullptr) ? false : ignore_nan_attr->GetBool(); + int64_t data_num = ctx.Output(0)->NumElements(); + for (int64_t i = 0; i < data_num; ++i) { + if (ignore_nan == false && is_float16 == true) { + if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == false && is_float16 == false) { + if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == true && is_float16 == true) { + if (Eigen::numext::isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else if (Eigen::numext::isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i)) + ? 
*(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + if (ignore_nan == true && is_float16 == false) { + if (isnan(*(in0 + bcast.GetBroadcastXIndex(i)))) { + *(out + i) = *(in1 + bcast.GetBroadcastYIndex(i)); + } else if (isnan(*(in1 + bcast.GetBroadcastYIndex(i)))) { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)); + } else { + *(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) < *(in1 + bcast.GetBroadcastYIndex(i)) + ? *(in0 + bcast.GetBroadcastXIndex(i)) + : *(in1 + bcast.GetBroadcastYIndex(i)); + } + } + } +} + +template +uint32_t MinimumCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) { + int64_t data_num = ctx.Output(0)->NumElements(); + bool is_float16 = false; + if (std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value) { + is_float16 = false; + } else { + is_float16 = true; + } + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + + if (data_num <= kParallelDataNumMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + + if (max_core_num > data_num) { + max_core_num = data_num; + } + + auto sharder_minimum = [&](int64_t start, int64_t end) { + BcastComputeMultiKernel(start, end, ctx, bcast, is_float16); + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_minimum), + "Minimum Compute failed."); + } else { + BcastComputeOneKernel(ctx, bcast, is_float16); + } + return KERNEL_STATUS_OK; +} + +template +uint32_t MinimumCpuKernel::MinimumCompute(CpuKernelContext &ctx) { + Tensor *input0_tensor = ctx.Input(0); + auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes(); + int64_t input0_elements_nums = input0_tensor->NumElements(); + + Tensor *input1_tensor = ctx.Input(1); + auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes(); + int64_t input1_elements_nums = input1_tensor->NumElements(); + bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1); + if (isNeedBcast) { + return NoBcastCompute(ctx); + } else { + Bcast bcast(input0_shape, input1_shape); + if (!bcast.IsValid()) { + KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + return BcastCompute(ctx, bcast); + } + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kMinimum, MinimumCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.h new file mode 100644 index 00000000000..9b89a55308d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.h @@ -0,0 +1,63 @@ +/** + * Copyright(c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unminimum required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef AICPU_KERNELS_NORMALIZED_MINIMUM_H_
+#define AICPU_KERNELS_NORMALIZED_MINIMUM_H_
+
+#include "cpu_ops_kernel.h"
+#include "utils/bcast.h"
+
+namespace aicpu {
+class MinimumCpuKernel : public CpuKernel {
+ public:
+  MinimumCpuKernel() = default;
+  ~MinimumCpuKernel() override = default;
+
+ protected:
+  uint32_t Compute(CpuKernelContext &ctx) override;
+
+ private:
+  uint32_t MinimumParamCheck(CpuKernelContext &ctx);
+
+  template <typename T>
+  void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
+
+  template <typename T>
+  void SpecialComputeSameShape(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
+
+  template <typename T>
+  void SpecialComputeXOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
+
+  template <typename T>
+  void SpecialComputeYOneElement(int64_t start, int64_t end, CpuKernelContext &ctx, bool is_float16);
+
+  template <typename T>
+  uint32_t NoBcastCompute(CpuKernelContext &ctx);
+
+  template <typename T>
+  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
+
+  template <typename T>
+  void BcastComputeMultiKernel(int64_t start, int64_t end, CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
+
+  template <typename T>
+  void BcastComputeOneKernel(CpuKernelContext &ctx, Bcast &bcast, bool is_float16);
+
+  template <typename T>
+  uint32_t MinimumCompute(CpuKernelContext &ctx);
+};
+}  // namespace aicpu
+#endif
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reciprocal.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reciprocal.h
index 8f2492338e1..5ed0bb09d64 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reciprocal.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reciprocal.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
+ * Copyright 2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc
index ef96d88f253..7dff4556716 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc
@@ -171,11 +171,8 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
 mindspore::kKLDivOpName,
 mindspore::kKlDivLossGradOpName,
 mindspore::kLcmOpName,
- mindspore::kLessEqualOpName,
- mindspore::kLogicalXorOpName,
 mindspore::kLogitOpName,
 mindspore::kLogitGradOpName,
- mindspore::kLogNormalReverseOpName,
 mindspore::kLowerBoundOpName,
 mindspore::kLstsqOpName,
 mindspore::kLuUnpackOpName,
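
Reviewer note, not part of the patch: the Maximum and Minimum kernels above repeat one NaN rule across every SpecialCompute*/BcastCompute* branch. With the optional ignore_nan attribute absent or false, a NaN in either operand propagates to the output; with ignore_nan set to true, the non-NaN operand wins. The Eigen::half branches differ only in calling Eigen::numext::isnan. A minimal standalone C++ sketch of that rule for a single element pair, using std::isnan in place of the kernel's CpuKernelContext plumbing (all names here are illustrative, not from the patch):

// nan_semantics_sketch.cc -- illustration only; not part of this patch.
// Mirrors the branch structure of MaximumCpuKernel::SpecialComputeSameShape
// for one element pair, under both ignore_nan settings.
#include <cmath>
#include <cstdio>

template <typename T>
T MaxWithNanRule(T a, T b, bool ignore_nan) {
  if (ignore_nan) {
    // ignore_nan = true: a NaN operand is skipped, the other value is returned.
    if (std::isnan(a)) return b;
    if (std::isnan(b)) return a;
  } else {
    // ignore_nan = false (the default when the attribute is absent):
    // any NaN operand propagates to the output.
    if (std::isnan(a)) return a;
    if (std::isnan(b)) return b;
  }
  return a > b ? a : b;
}

int main() {
  const float nan = std::nanf("");
  std::printf("%f\n", MaxWithNanRule(nan, 2.0f, false));  // nan
  std::printf("%f\n", MaxWithNanRule(nan, 2.0f, true));   // 2.000000
  std::printf("%f\n", MaxWithNanRule(1.0f, 2.0f, true));  // 2.000000
  return 0;
}

Folding the four per-iteration if blocks into a helper of this shape would also remove a large amount of duplicated code shared by the Maximum and Minimum kernels.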
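
Reviewer note, not part of the patch: MexpImpl in matrix_exp.cc selects a Taylor approximant of order 1, 2, 4, 8, 12 or 18 from the norm thresholds in thetas_float / thetas_double, and when the 1-norm exceeds the last threshold it scales A by 2^-s with s = ceil(log2(norm / theta_last)), applies the highest-order approximant, and squares the result s times. A self-contained C++ sketch of that scaling-and-squaring idea on 2x2 matrices, with a plain truncated Taylor sum standing in for the kernel's tuned approximants (the Mat2 type, Expm name, and the order/theta values are illustrative assumptions, not from the patch):

// scaling_and_squaring_sketch.cc -- illustration only; not part of this patch.
// exp(A) = (exp(A / 2^s))^(2^s): approximate exp on a small-norm matrix,
// then square s times, so the series never sees a large argument.
#include <array>
#include <cmath>
#include <cstdio>

using Mat2 = std::array<std::array<double, 2>, 2>;

Mat2 Mul(const Mat2 &a, const Mat2 &b) {
  Mat2 c{};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 2; ++k) c[i][j] += a[i][k] * b[k][j];
  return c;
}

Mat2 TaylorExp(const Mat2 &a, int order) {
  Mat2 result{{{1.0, 0.0}, {0.0, 1.0}}};  // running sum, starts at I
  Mat2 term = result;                     // running term A^n / n!
  for (int n = 1; n <= order; ++n) {
    term = Mul(term, a);
    for (auto &row : term)
      for (auto &v : row) v /= n;
    for (int i = 0; i < 2; ++i)
      for (int j = 0; j < 2; ++j) result[i][j] += term[i][j];
  }
  return result;
}

double OneNorm(const Mat2 &a) {
  // max column sum, the same norm MexpImpl uses to pick order and scaling
  double c0 = std::fabs(a[0][0]) + std::fabs(a[1][0]);
  double c1 = std::fabs(a[0][1]) + std::fabs(a[1][1]);
  return c0 > c1 ? c0 : c1;
}

Mat2 Expm(Mat2 a, int order = 12, double theta = 1.0) {
  int s = 0;
  double norm = OneNorm(a);
  if (norm > theta) {
    s = static_cast<int>(std::ceil(std::log2(norm / theta)));
    double scale = std::pow(2.0, s);
    for (auto &row : a)
      for (auto &v : row) v /= scale;  // A <- A / 2^s
  }
  Mat2 e = TaylorExp(a, order);
  for (int k = 0; k < s; ++k) e = Mul(e, e);  // undo the scaling: E <- E^2, s times
  return e;
}

int main() {
  // exp of a diagonal matrix is elementwise exp of the diagonal: expect e^3 and e^5.
  Mat2 a{{{3.0, 0.0}, {0.0, 5.0}}};
  Mat2 e = Expm(a);
  std::printf("%.4f %.4f\n%.4f %.4f\n", e[0][0], e[0][1], e[1][0], e[1][1]);
  return 0;
}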