diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.cc
index 3dd000e0734..b4081ef3c96 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.cc
@@ -1,116 +1,116 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h"

namespace mindspore {
namespace kernel {
constexpr size_t kBceInputNumWithWeight = 3;

template <typename T>
void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) {
  if (input_size % 2 == 1) {
    tmp_loss[0] += tmp_loss[input_size - 1];
  }

  for (int stride = input_size / 2; stride > 0; stride = stride / 2) {
    for (int i = 0; i < stride; i++) {
      tmp_loss[i] += tmp_loss[i + stride];
    }
    if (stride > 2 && stride % 2 == 1) {
      tmp_loss[0] += tmp_loss[stride - 1];
    }
  }

  loss[0] += tmp_loss[0];
  if (reduction == 1) {
    loss[0] /= static_cast<T>(input_size);
  }
}

template <typename T>
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                               const std::vector<AddressPtr> &workspace,
                                               const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[2]->addr);
  }
  T *loss = reinterpret_cast<T *>(outputs[0]->addr);
  std::vector<T> tmp_loss(input_size_);

  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  if (reduction_ == 0 && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if (reduction_ == 0 && (!weight_defined_)) {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if ((reduction_ != 0) && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  } else {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  }

  if (reduction_ != 0) {
    LaunchToScalar(input_size_, reduction_, loss, tmp_loss.data());
  }
}

bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
                                         const std::vector<AddressPtr> &workspace,
                                         const std::vector<AddressPtr> &outputs) {
  if (input_size_ > 0) {
    if (dtype_ == kNumberTypeFloat32) {
      Launchkernel<float>(inputs, workspace, outputs);
    } else if (dtype_ == kNumberTypeFloat16) {
      Launchkernel<float16>(inputs, workspace, outputs);
    }
  }
  return true;
}

void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
}  // namespace kernel
}  // namespace mindspore
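Per element, the kernel above evaluates loss = -w * (y * log(x + eps) + (1 - y) * log(1 - x + eps)). A minimal standalone sketch of that arithmetic (plain C++ outside the MindSpore build; the names and values are illustrative only, with the weight omitted):

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  const float eps = 1e-12f;
  std::vector<float> x = {0.9f, 0.2f};  // predictions in (0, 1)
  std::vector<float> y = {1.0f, 0.0f};  // binary targets
  for (std::size_t i = 0; i < x.size(); ++i) {
    // Same per-element expression as the kernel, without the optional weight.
    float loss = -(y[i] * std::log(x[i] + eps) + (1.0f - y[i]) * std::log(1.0f - x[i] + eps));
    std::printf("loss[%zu] = %f\n", i, loss);  // ~0.1054 and ~0.2231
  }
  return 0;
}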
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h
index 6d3abaa54e4..0e4e8f671d8 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h
@@ -1,71 +1,71 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H

#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

namespace mindspore {
namespace kernel {
class BinaryCrossEntropyCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyCpuKernel() override = default;

  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  template <typename T>
  void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                    const std::vector<AddressPtr> &outputs);

  TypeId dtype_{kTypeUnknown};
  size_t input_size_;
  int reduction_;
  bool weight_defined_;  // true: there are 3 inputs, false: there are 2 inputs(no [weight])
};
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
}  // namespace kernel
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
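LaunchToScalar in the .cc above folds the per-element buffer into a scalar with a pairwise (tree) reduction rather than a naive running sum, which limits rounding error, especially for float16. A self-contained sketch of the same folding scheme, assuming a plain float buffer:

#include <cstdio>
#include <vector>

// Same scheme as LaunchToScalar: fold the odd tail into element 0, then
// repeatedly add the upper half of the live prefix onto the lower half.
float ReduceToScalar(std::vector<float> buf) {
  int n = static_cast<int>(buf.size());
  if (n % 2 == 1) {
    buf[0] += buf[n - 1];
  }
  for (int stride = n / 2; stride > 0; stride /= 2) {
    for (int i = 0; i < stride; ++i) {
      buf[i] += buf[i + stride];
    }
    if (stride > 2 && stride % 2 == 1) {
      buf[0] += buf[stride - 1];  // keep the element the next halving would drop
    }
  }
  return buf[0];
}

int main() {
  std::vector<float> v = {1, 2, 3, 4, 5, 6, 7};
  std::printf("tree sum = %g (expected 28)\n", ReduceToScalar(v));
  return 0;
}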
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.cc
index 49d6b4b84de..9613aa6616b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.cc
@@ -1,102 +1,102 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h"

namespace mindspore {
namespace kernel {
constexpr size_t kBceGradInputNumWithWeight = 4;

template <typename T>
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                                   const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *dloss = reinterpret_cast<T *>(inputs[2]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[3]->addr);
  }

  T *dx = reinterpret_cast<T *>(outputs[0]->addr);

  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  if (reduction_ == 0) {
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    }
  } else {
    T dloss1 = dloss[0];
    if (reduction_ == 1) {
      dloss1 = dloss[0] / static_cast<T>(input_size_);
    }
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    }
  }
}

bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
                                             const std::vector<AddressPtr> &workspace,
                                             const std::vector<AddressPtr> &outputs) {
  if (input_size_ > 0) {
    if (dtype_ == kNumberTypeFloat32) {
      Launchkernel<float>(inputs, outputs);
    } else if (dtype_ == kNumberTypeFloat16) {
      Launchkernel<float16>(inputs, outputs);
    }
  }
  return true;
}

void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }

  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceGradInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
}  // namespace kernel
}  // namespace mindspore
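For each element the grad kernel computes dx = w * (x - y) / max(x * (1 - x), eps), scaled by the incoming dloss (and divided by the element count when reduction is "mean"). A standalone numeric check of that formula, with illustrative values:

#include <algorithm>
#include <cstdio>

int main() {
  const float eps = 1e-12f;
  float x = 0.9f, y = 1.0f, dloss = 1.0f;  // prediction, target, upstream gradient
  // Clamp the denominator exactly as the kernel does to avoid division by zero
  // when x saturates at 0 or 1.
  float denominator = std::max(x * (1.0f - x), eps);
  float dx = (x - y) / denominator * dloss;
  std::printf("dx = %f\n", dx);  // (0.9 - 1.0) / 0.09 = -1.111111
  return 0;
}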
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h
index 95a506c6793..7aa253afb92 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h
@@ -1,76 +1,76 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H

#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

namespace mindspore {
namespace kernel {
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyGradCpuKernel() override = default;

  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

  TypeId dtype_{kTypeUnknown};
  size_t input_size_;
  int reduction_;
  bool weight_defined_;  // true: there are 4 inputs, false: there are 3 inputs(no [weight])
};
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
}  // namespace kernel
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
index 5e55f128394..6be00ca19cb 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
@@ -1,271 +1,271 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include <algorithm>
#include <utility>
#include "common/thread_pool.h"

namespace mindspore {
namespace kernel {
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    input_size_list_.emplace_back(tensor_size);
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    output_size_list_.emplace_back(tensor_size);
  }
}

void CPUKernel::Init(const CNodePtr &kernel_node) {
  InitKernel(kernel_node);
  InitInputOutputSize(kernel_node);
}

void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
  auto len = shape->size();
  if (len < 4) {
    for (size_t i = 0; i < 4 - len; ++i) {
      shape->insert(shape->begin(), 1);
    }
  }
}

size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2,
                                  size_t dim3) {
  size_t offset = dim0 * shape[1] * shape[2] * shape[3] + dim1 * shape[2] * shape[3] + dim2 * shape[3] + dim3;
  return offset;
}

size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t result = 1;
  for (int j = 3; j > axis; --j) {
    result *= shape[j];
  }
  return result;
}

void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
  size_t accumulation = 1;
  element_num->emplace_back(1);
  for (size_t i = shape.size() - 1; i > 0; --i) {
    accumulation *= shape[i];
    element_num->emplace_back(accumulation);
  }
  std::reverse(element_num->begin(), element_num->end());
}

void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    auto block = [&, start, end]() {
      task(start, end);
      return common::SUCCESS;
    };
    tasks.emplace_back(block);
    start += once_compute_size;
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}

std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t dim_row = 1;
  size_t dim_col = 1;
  std::vector<size_t> flat_shape;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (SizeToInt(i) < axis) {
      dim_row *= shape[i];
    } else {
      dim_col *= shape[i];
    }
  }
  flat_shape.push_back(dim_row);
  flat_shape.push_back(dim_col);
  return flat_shape;
}

BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}

void BroadcastIterator::SetPos(size_t pos) {
  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % output_shape_[i];
    input_pos_[0] += coordinates_[i] * input_strides_a_[i];
    input_pos_[1] += coordinates_[i] * input_strides_b_[i];
    pos /= output_shape_[i];
  }
}

void BroadcastIterator::GenNextPos() {
  // Calculate output next coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      input_pos_[0] -= input_back_strides_a_[i];
      input_pos_[1] -= input_back_strides_b_[i];
    } else {
      ++coordinates_[i];
      input_pos_[0] += input_strides_a_[i];
      input_pos_[1] += input_strides_b_[i];
      break;
    }
  }
}

void BroadcastIterator::BroadcastShape() {
  int input_dimension_a = input_shape_a_.size();
  if (input_dimension_a < output_dimension_) {
    input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
  }

  int input_dimension_b = input_shape_b_.size();
  if (input_dimension_b < output_dimension_) {
    input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
  }
}

void BroadcastIterator::InitStrides() {
  input_strides_a_[output_dimension_ - 1] = 1;
  input_strides_b_[output_dimension_ - 1] = 1;
  for (int i = output_dimension_ - 2; i >= 0; --i) {
    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
  }

  // Update strides for broadcast
  // While the axis value is 1, the stride is 0
  std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
  std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
}

TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<int> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Calculate strides
  dimension_ = shape_.size();
  std::vector<size_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }

  // Swap shape and strides and calculate back strides
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }

  // Calculate coordinate by pos
  coordinates_.resize(dimension_);
}

void TransposeIterator::SetPos(size_t pos) {
  for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % shape_[i];
    pos_ += coordinates_[i] * strides_[i];
    pos /= shape_[i];
  }
}

void TransposeIterator::GenNextPos() {
  for (int i = dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == shape_[i]) {
      coordinates_[i] = 0;
      pos_ -= back_strides_[i];
    } else {
      coordinates_[i]++;
      pos_ += strides_[i];
      break;
    }
  }
}

std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) {
  size_t x_len = x.size();
  size_t y_len = y.size();
  size_t length = x_len < y_len ? x_len : y_len;
  std::vector<size_t> broadcast_shape;
  std::vector<size_t> broadcast_shape_back;
  for (int i = -SizeToInt(length); i < 0; ++i) {
    if (x[x_len + i] == 1) {
      broadcast_shape_back.push_back(y[y_len + i]);
    } else if (y[y_len + i] == 1) {
      broadcast_shape_back.push_back(x[x_len + i]);
    } else if (x[x_len + i] == y[y_len + i]) {
      broadcast_shape_back.push_back(x[x_len + i]);
    }
  }
  if (length == x_len) {
    for (size_t i = 0; i < y_len - length; ++i) {
      broadcast_shape.push_back(y[i]);
    }
  } else {
    for (size_t i = 0; i < x_len - length; ++i) {
      broadcast_shape.push_back(x[i]);
    }
  }
  for (size_t i = 0; i < length; ++i) {
    broadcast_shape.push_back(broadcast_shape_back[i]);
  }
  return broadcast_shape;
}

}  // namespace kernel
}  // namespace mindspore
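ParallelFor above targets roughly 128 elements per task: when the workload is small it uses ceil(count / 128) tasks, otherwise one per sync-run thread, and each task receives a half-open [start, end) slice. A standalone sketch of just that chunking arithmetic, with an assumed pool size of 8 (the thread count and element count here are illustrative, not values from the source):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

int main() {
  const float block_size = 128.0f;
  std::size_t max_thread_num = 8;  // assumed pool size for illustration
  std::size_t count = 1000;
  // Same sizing rule as ParallelFor: ~128 elements per task, capped by the pool.
  std::size_t thread_num = count < block_size * max_thread_num
                             ? static_cast<std::size_t>(std::ceil(count / block_size))
                             : max_thread_num;
  std::size_t once_compute_size = (count + thread_num - 1) / thread_num;
  for (std::size_t start = 0; start < count; start += once_compute_size) {
    std::size_t end = std::min(start + once_compute_size, count);
    std::printf("task [%zu, %zu)\n", start, end);  // eight slices of 125 elements
  }
  return 0;
}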
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
index 31a470eee1c..49d87e17439 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
@@ -1,205 +1,205 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
#include <array>
#include <functional>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "backend/kernel_compiler/kernel.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/kernel_compiler/common_utils.h"
#include "ir/anf.h"

using mindspore::kernel::Address;
using mindspore::kernel::AddressPtr;
using CTask = std::function<void(size_t, size_t)>;
namespace mindspore {
namespace kernel {
const char KERNEL_SIZE[] = "kernel_size";
const char STRIDE[] = "stride";
const char STRIDES[] = "strides";
const char DILATION[] = "dilation";
const char DILATIONS[] = "dilations";
const char FORMAT[] = "format";
const char PAD[] = "pad";
const char PAD_LIST[] = "pad_list";
const char PAD_MODE[] = "pad_mode";
const char PAD_MODE_LOWER_SAME[] = "same";
const char PAD_MODE_LOWER_VALID[] = "valid";
const char PAD_MODE_UPPER_SAME[] = "SAME";
const char PAD_MODE_UPPER_VALID[] = "VALID";
const char TRANSPOSE_A[] = "transpose_a";
const char TRANSPOSE_B[] = "transpose_b";
const char IS_GRAD[] = "is_grad";
const char TRANSPOSE_NO = 'N';
const char TRANSPOSE_YES = 'T';
const char AXIS[] = "axis";
const char DIM[] = "dim";
const char BEGIN[] = "begin";
const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
const char START[] = "start";
const char LIMIT[] = "limit";
const char DELTA[] = "delta";
const char SORTED[] = "sorted";
const char ADJ_ST[] = "adjoint_st";
const char ADJ_dT[] = "adjoint_dt";

enum OperateType {
  ADD = 0,
  SUB,
  MUL,
  DIV,
  SQUARE,
  SQRT,
  POW,
  REALDIV,
  FLOORDIV,
  MOD,
  FLOORMOD,
  NEG,
  LESS,
  ASSIGNADD,
  RELUGRAD,
  RELU6GRAD,
  ABSGRAD,
  TANHGRAD,
  SQRTGRAD,
  SIGMOIDGRAD,
  ONESLIKE,
  ZEROSLIKE,
  SIGN,
  EQUAL,
  NOTEQUAL,
  LESSEQUAL,
  LOGICALAND,
  LOGICALOR,
  LOGICALNOT,
  FLOOR,
  SQUAREDDIFFERENCE,
  GREATER,
  GREATEREQUAL,
  RECIPROCAL,
  GELU,
  GELUGRAD,
  ASIN,
  ACOS,
  ATAN,
  ASINGRAD,
  ACOSGRAD,
  ATANGRAD,
  SIN,
  COS,
  TAN,
  SINH,
  COSH,
  ASINH,
  ACOSH,
  ATANH,
  ASINHGRAD,
  ACOSHGRAD,
  ATAN2,
  RINT,
  ROUND,
  IDENTITY,
};

class CPUKernel : public kernel::KernelMod {
 public:
  CPUKernel() = default;
  ~CPUKernel() override = default;
  virtual void Init(const CNodePtr &kernel_node);
  virtual void InitKernel(const CNodePtr &kernel_node) = 0;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override {
    return Launch(inputs, workspace, outputs);
  };
  virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                      const std::vector<AddressPtr> &outputs) = 0;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

 protected:
  virtual void InitInputOutputSize(const CNodePtr &kernel_node);
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};

class CPUKernelUtils {
 public:
  static void ExpandDimsTo4(std::vector<size_t> *shape);
  static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
  static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
  static void ParallelFor(const CTask &task, size_t count);
  static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
  static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
};

class BroadcastIterator {
 public:
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;
  inline size_t GetInputPosA() const { return input_pos_[0]; }
  inline size_t GetInputPosB() const { return input_pos_[1]; }
  void SetPos(size_t pos);
  void GenNextPos();

 private:
  void BroadcastShape();
  void InitStrides();

  std::vector<size_t> coordinates_;
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  std::array<size_t, 2> input_pos_{0};
  int output_dimension_{0};
};

class TransposeIterator {
 public:
  TransposeIterator(std::vector<size_t> output_shape, std::vector<int> axes, const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;
  inline size_t GetPos() const { return pos_; }
  void SetPos(size_t pos);
  void GenNextPos();

 private:
  int dimension_{0};
  std::vector<size_t> coordinates_;
  std::vector<size_t> shape_;
  std::vector<size_t> strides_;
  std::vector<size_t> back_strides_;
  std::vector<int> axes_;
  size_t pos_{0};
};
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
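BroadcastIterator implements broadcasting by zeroing the stride of any axis whose input extent is 1, so the same input element is re-read while the output coordinate advances. A standalone illustration of that zero-stride trick for an input of shape (1, 3) broadcast to an output of shape (2, 3); the shapes here are chosen only for demonstration:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Output shape {2, 3}; input a has shape {1, 3}, so axis 0 gets stride 0
  // and every output row re-reads the same three input elements.
  std::vector<std::size_t> out_shape = {2, 3};
  std::vector<std::size_t> a_strides = {0, 1};
  for (std::size_t i = 0; i < out_shape[0]; ++i) {
    for (std::size_t j = 0; j < out_shape[1]; ++j) {
      std::size_t pos_a = i * a_strides[0] + j * a_strides[1];
      std::printf("out(%zu,%zu) <- a[%zu]\n", i, j, pos_a);  // rows reuse a[0..2]
    }
  }
  return 0;
}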
- */ - -#include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h" -#include "runtime/device/cpu/cpu_device_address.h" - -namespace mindspore { -namespace kernel { -void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { - CheckParam(kernel_node); - probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); - labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); - dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); - - if (probs_shape_.size() != 3) { - MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " not support."; - } - if (labels_dims_.size() != 1) { - MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support."; - } - if (indice_dims_.size() != 2) { - MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support."; - } - - preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr(kernel_node, "preprocess_collapse_repeated"); - ctc_merge_repeated_ = AnfAlgo::GetNodeAttr(kernel_node, "ctc_merge_repeated"); - ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr(kernel_node, "ignore_longer_outputs_than_inputs"); - - max_time_ = probs_shape_[0]; - batch_size_ = probs_shape_[1]; - num_class_ = probs_shape_[2]; - blank_index_ = num_class_ - 1; -} - -bool CTCLossCPUKernel::Launch(const std::vector &inputs, const std::vector &, - const std::vector &outputs) { - if (dtype_ == kNumberTypeFloat16) { - LaunchKernel(inputs, outputs); - } else if (dtype_ == kNumberTypeFloat32) { - LaunchKernel(inputs, outputs); - } - return true; -} - -template -inline T LogSumExp(const T logprob1, const T logprob2) { - T kLogZero_ = -std::numeric_limits::infinity(); - if (logprob1 <= kLogZero_) { - return logprob2; - } else if (logprob2 <= kLogZero_) { - return logprob1; - } else { - return (logprob1 > logprob2) ? logprob1 + static_cast(log1p(exp(logprob2 - logprob1))) - : logprob2 + static_cast(log1p(exp(logprob1 - logprob2))); - } -} - -template -void CTCLossCPUKernel::CalculateFwdVar(const std::vector &label_with_blank, - const std::vector> &y, - std::vector> *log_alpha_b) { - int U = label_with_blank.size(); - int T = (*log_alpha_b)[0].size(); - TT kLogZero_ = -std::numeric_limits::infinity(); - - (*log_alpha_b)[0][0] = static_cast(log(y[blank_index_][0])); - auto label_0 = (label_with_blank.size() > 1) ? 
label_with_blank[1] : blank_index_; - if (label_with_blank.size() > 1) { - (*log_alpha_b)[1][0] = static_cast(log(y[label_0][0])); - } - - for (int t = 1; t < T; ++t) { - int low = std::max(0, U - (2 * (T - t))); - int high = std::min(U, 2 * (t + 1)); - for (int u = low; u < high; ++u) { - auto sum_log_alpha_b = kLogZero_; - if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) { - sum_log_alpha_b = (*log_alpha_b)[u][t - 1]; - } - - if (u > 0) { - sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]); - } - - if (u > 1) { - bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]); - if (label_with_blank[u] != blank_index_ && !matching_labels_merge) { - sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]); - } - } - - (*log_alpha_b)[u][t] = - static_cast(log(static_cast(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b; - } - } -} - -template -void CTCLossCPUKernel::CalculateBwdVar(const std::vector &label_with_blank, - const std::vector> &y, - std::vector> *log_beta_b) { - int T = (*log_beta_b)[0].size(); - int U = label_with_blank.size(); - if (U > 1) { - for (int u = U - 2; u < U; ++u) { - (*log_beta_b)[u][T - 1] = TT(0); - } - } else { - (*log_beta_b)[0][T - 1] = TT(0); - (*log_beta_b)[0][T - 2] = TT(0); - } - - for (int t = T - 2; t >= 0; --t) { - int low = std::max(0, U - (2 * (T - t))); - int high = std::min(U, 2 * (t + 1)); - for (int u = low; u < high; ++u) { - if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) { - (*log_beta_b)[u][t] = - LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1]))); - } - - if (u + 1 < U) { - (*log_beta_b)[u][t] = - LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1]))); - } - - if (u + 2 < U) { - bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]); - if (label_with_blank[u] != blank_index_ && !matching_labels_merge) { - (*log_beta_b)[u][t] = - LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1]))); - } - } - } - } -} - -template -void CTCLossCPUKernel::CalculateGrad(const std::vector &label_with_blank, - const std::vector> &y, - const std::vector> &log_alpha_b, - const std::vector> &log_beta_b, const TT log_pzx, - std::vector> *dy) { - auto dy_b = dy; - TT kLogZero_ = -std::numeric_limits::infinity(); - if (log_pzx <= kLogZero_) { - MS_LOG(INFO) << "No valid path found"; - return; - } - - size_t L = y.size(); - size_t T = y[0].size(); - size_t U = label_with_blank.size(); - - for (size_t t = 0; t < T; ++t) { - std::vector prob_sum(L, kLogZero_); - - for (size_t u = 0; u < U; ++u) { - uint32_t l = label_with_blank[u]; - prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]); - } - for (size_t l = 0; l < L; ++l) { - (*dy_b)[l][t] = y[l][t] - static_cast(exp(prob_sum[l] - log_pzx)); - } - } -} - -void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector> &batch_label, - std::vector> *label_with_blank) { - for (size_t b = 0; b < batch_size_; ++b) { - std::vector l; - const std::vector &label = batch_label[b]; - bool has_blank = false; - for (size_t i = 0; i < label.size(); ++i) { - if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) { - if (label[i] >= num_class_ - 1) { - has_blank = true; - } else { - if (has_blank) { - MS_LOG(EXCEPTION) << "Invalid labels(index >= num_class 
- 1) should not appear between two valid labels"; - } - l.push_back(label[i]); - } - } - } - if (!ignore_longer_outputs_than_inputs_) { - if (l.size() > seq_len[b]) { - MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets " - << seq_len[b] << "< " << l.size(); - } - } - - (*label_with_blank)[b].reserve(2 * l.size() + 1); - for (auto l_i : l) { - (*label_with_blank)[b].push_back(blank_index_); - (*label_with_blank)[b].push_back(l_i); - } - (*label_with_blank)[b].push_back(blank_index_); - } -} - -template -void InnerSoftMax(const T *inputs_addr, std::vector> *softmax_probs, const uint32_t sequence_length, - size_t num_class, size_t batch_size, size_t b) { - for (size_t t = 0; t < sequence_length; ++t) { - T maxCoeff(T(0)); - T sumCoeff(T(0)); - - for (size_t c = 0; c < num_class; ++c) { - if (inputs_addr[t * batch_size * num_class + b * num_class + c] > maxCoeff) { - maxCoeff = inputs_addr[t * batch_size * num_class + b * num_class + c]; - } - } - - for (size_t c = 0; c < num_class; ++c) { - sumCoeff += static_cast(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); - (*softmax_probs)[c][t] = - static_cast(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); - } - - for (size_t c = 0; c < num_class; ++c) { - (*softmax_probs)[c][t] /= sumCoeff; - } - } -} - -template -void MatrixfromVector(uint32_t row, uint32_t col, std::vector> *array2D, const T init_value) { - array2D->resize(row); - for (size_t i = 0; i < row; ++i) { - (*array2D)[i].resize(col, init_value); - } -} - -template -void CTCLossCPUKernel::LaunchKernel(const std::vector &inputs, const std::vector &outputs) { - auto inputs_addr = reinterpret_cast(inputs[0]->addr); - auto labels_indices_addr = reinterpret_cast(inputs[1]->addr); - auto labels_values_addr = reinterpret_cast(inputs[2]->addr); - auto sequence_length_addr = reinterpret_cast(inputs[3]->addr); - auto loss_addr = reinterpret_cast(outputs[0]->addr); - auto gradient_addr = reinterpret_cast(outputs[1]->addr); - - std::vector> label_batch; - std::vector> labels_with_blank; - std::vector each_label_length; - - label_batch.resize(batch_size_); - labels_with_blank.resize(batch_size_); - each_label_length.resize(batch_size_, 0); - - T kLogZero_ = -std::numeric_limits::infinity(); - // check validation of sequence length - for (size_t b = 0; b < batch_size_; ++b) { - if (sequence_length_addr[b] == uint32_t(0)) { - MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b]; - } - - if (sequence_length_addr[b] > max_time_) { - MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < " - << sequence_length_addr[b]; - } - } - - for (size_t i = 0; i < indice_dims_[0]; ++i) { - each_label_length[labels_indices_addr[i * 2]]++; - } - - // convert label format of label_value and label_indices to batch_label - uint64_t cum_sum = 0; - for (size_t b = 0; b < batch_size_; ++b) { - std::vector *b_value = &label_batch[b]; - for (size_t l = 0; l < each_label_length[b]; ++l) { - b_value->push_back(labels_values_addr[cum_sum + l]); - } - cum_sum += each_label_length[b]; - } - - // convert label to label with blank - GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank); - - for (size_t b = 0; b < batch_size_; ++b) { - std::vector label_with_blank = labels_with_blank[b]; - // y_b [num_class, sequence_length] - std::vector> y_b; - std::vector> dy; - std::vector> log_alpha_b; - std::vector> log_beta_b; - 
MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_); - MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0)); - MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_); - MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_); - InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b); - - CalculateFwdVar(label_with_blank, y_b, &log_alpha_b); - CalculateBwdVar(label_with_blank, y_b, &log_beta_b); - - T log_pzx = kLogZero_; - for (size_t u = 0; u < label_with_blank.size(); ++u) { - log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]); - } - - loss_addr[b] = -log_pzx; - - CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy); - - for (size_t t = 0; t < sequence_length_addr[b]; ++t) { - for (size_t c = 0; c < num_class_; ++c) { - gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t]; - } - } - } -} - -void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) { - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - if (input_num != 4) { - MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num; - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != 2) { - MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num; - } -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h"
+#include "runtime/device/cpu/cpu_device_address.h"
+
+namespace mindspore {
+namespace kernel {
+void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) {
+  CheckParam(kernel_node);
+  probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
+  labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
+  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
+
+  if (probs_shape_.size() != 3) {
+    MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " is not supported.";
+  }
+  if (labels_dims_.size() != 1) {
+    MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " is not supported.";
+  }
+  if (indice_dims_.size() != 2) {
+    MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " is not supported.";
+  }
+
+  preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated");
+  ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated");
+  ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
+
+  max_time_ = probs_shape_[0];
+  batch_size_ = probs_shape_[1];
+  num_class_ = probs_shape_[2];
+  blank_index_ = num_class_ - 1;
+}
+
+bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
+                              const std::vector<kernel::AddressPtr> &outputs) {
+  if (dtype_ == kNumberTypeFloat16) {
+    LaunchKernel<float16>(inputs, outputs);
+  } else if (dtype_ == kNumberTypeFloat32) {
+    LaunchKernel<float>(inputs, outputs);
+  }
+  return true;
+}
+
+template <typename T>
+inline T LogSumExp(const T logprob1, const T logprob2) {
+  T kLogZero_ = -std::numeric_limits<T>::infinity();
+  if (logprob1 <= kLogZero_) {
+    return logprob2;
+  } else if (logprob2 <= kLogZero_) {
+    return logprob1;
+  } else {
+    return (logprob1 > logprob2) ? logprob1 + static_cast<T>(log1p(exp(logprob2 - logprob1)))
+                                 : logprob2 + static_cast<T>(log1p(exp(logprob1 - logprob2)));
+  }
+}
+
+template <typename TT>
+void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
+                                       const std::vector<std::vector<TT>> &y,
+                                       std::vector<std::vector<TT>> *log_alpha_b) {
+  int U = label_with_blank.size();
+  int T = (*log_alpha_b)[0].size();
+  TT kLogZero_ = -std::numeric_limits<TT>::infinity();
+
+  (*log_alpha_b)[0][0] = static_cast<TT>(log(y[blank_index_][0]));
+  auto label_0 = (label_with_blank.size() > 1) ? label_with_blank[1] : blank_index_;
+  if (label_with_blank.size() > 1) {
+    (*log_alpha_b)[1][0] = static_cast<TT>(log(y[label_0][0]));
+  }
+
+  for (int t = 1; t < T; ++t) {
+    int low = std::max(0, U - (2 * (T - t)));
+    int high = std::min(U, 2 * (t + 1));
+    for (int u = low; u < high; ++u) {
+      auto sum_log_alpha_b = kLogZero_;
+      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
+        sum_log_alpha_b = (*log_alpha_b)[u][t - 1];
+      }
+
+      if (u > 0) {
+        sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]);
+      }
+
+      if (u > 1) {
+        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]);
+        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
+          sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]);
+        }
+      }
+
+      (*log_alpha_b)[u][t] =
+        static_cast<TT>(log(static_cast<TT>(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b;
+    }
+  }
+}
+
+template <typename TT>
+void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank,
+                                       const std::vector<std::vector<TT>> &y,
+                                       std::vector<std::vector<TT>> *log_beta_b) {
+  int T = (*log_beta_b)[0].size();
+  int U = label_with_blank.size();
+  if (U > 1) {
+    for (int u = U - 2; u < U; ++u) {
+      (*log_beta_b)[u][T - 1] = TT(0);
+    }
+  } else {
+    (*log_beta_b)[0][T - 1] = TT(0);
+    (*log_beta_b)[0][T - 2] = TT(0);
+  }
+
+  for (int t = T - 2; t >= 0; --t) {
+    int low = std::max(0, U - (2 * (T - t)));
+    int high = std::min(U, 2 * (t + 1));
+    for (int u = low; u < high; ++u) {
+      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
+        (*log_beta_b)[u][t] =
+          LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1])));
+      }
+
+      if (u + 1 < U) {
+        (*log_beta_b)[u][t] =
+          LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1])));
+      }
+
+      if (u + 2 < U) {
+        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]);
+        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
+          (*log_beta_b)[u][t] =
+            LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1])));
+        }
+      }
+    }
+  }
+}
+
+template <typename TT>
+void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_blank,
+                                     const std::vector<std::vector<TT>> &y,
+                                     const std::vector<std::vector<TT>> &log_alpha_b,
+                                     const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx,
+                                     std::vector<std::vector<TT>> *dy) {
+  auto dy_b = dy;
+  TT kLogZero_ = -std::numeric_limits<TT>::infinity();
+  if (log_pzx <= kLogZero_) {
+    MS_LOG(INFO) << "No valid path found";
+    return;
+  }
+
+  size_t L = y.size();
+  size_t T = y[0].size();
+  size_t U = label_with_blank.size();
+
+  for (size_t t = 0; t < T; ++t) {
+    std::vector<TT> prob_sum(L, kLogZero_);
+
+    for (size_t u = 0; u < U; ++u) {
+      uint32_t l = label_with_blank[u];
+      prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]);
+    }
+    for (size_t l = 0; l < L; ++l) {
+      (*dy_b)[l][t] = y[l][t] - static_cast<TT>(exp(prob_sum[l] - log_pzx));
+    }
+  }
+}
+
+void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
+                                         std::vector<std::vector<uint32_t>> *label_with_blank) {
+  for (size_t b = 0; b < batch_size_; ++b) {
+    std::vector<uint32_t> l;
+    const std::vector<uint32_t> &label = batch_label[b];
+    bool has_blank = false;
+    for (size_t i = 0; i < label.size(); ++i) {
+      if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) {
+        if (label[i] >= num_class_ - 1) {
+          has_blank = true;
+        } else {
+          if (has_blank) {
+            MS_LOG(EXCEPTION) << "Invalid labels (index >= num_class - 1) should not appear between two valid labels";
+          }
+          l.push_back(label[i]);
+        }
+      }
+    }
+    if (!ignore_longer_outputs_than_inputs_) {
+      if (l.size() > seq_len[b]) {
+        MS_LOG(EXCEPTION) << "Input time (sequence length) should be no less than output size (label length), but got "
+                          << seq_len[b] << " < " << l.size();
+      }
+    }
+
+    (*label_with_blank)[b].reserve(2 * l.size() + 1);
+    for (auto l_i : l) {
+      (*label_with_blank)[b].push_back(blank_index_);
+      (*label_with_blank)[b].push_back(l_i);
+    }
+    (*label_with_blank)[b].push_back(blank_index_);
+  }
+}
+
+template <typename T>
+void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
+                  size_t num_class, size_t batch_size, size_t b) {
+  for (size_t t = 0; t < sequence_length; ++t) {
+    T maxCoeff(T(0));
+    T sumCoeff(T(0));
+
+    for (size_t c = 0; c < num_class; ++c) {
+      if (inputs_addr[t * batch_size * num_class + b * num_class + c] > maxCoeff) {
+        maxCoeff = inputs_addr[t * batch_size * num_class + b * num_class + c];
+      }
+    }
+
+    for (size_t c = 0; c < num_class; ++c) {
+      sumCoeff += static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff));
+      (*softmax_probs)[c][t] =
+        static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff));
+    }
+
+    for (size_t c = 0; c < num_class; ++c) {
+      (*softmax_probs)[c][t] /= sumCoeff;
+    }
+  }
+}
+
+template <typename T>
+void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
+  array2D->resize(row);
+  for (size_t i = 0; i < row; ++i) {
+    (*array2D)[i].resize(col, init_value);
+  }
+}
+
+template <typename T>
+void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
+  auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
+  auto labels_indices_addr = reinterpret_cast<int64_t *>(inputs[1]->addr);
+  auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
+  auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
+  auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
+  auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
+
+  std::vector<std::vector<uint32_t>> label_batch;
+  std::vector<std::vector<uint32_t>> labels_with_blank;
+  std::vector<uint64_t> each_label_length;
+
+  label_batch.resize(batch_size_);
+  labels_with_blank.resize(batch_size_);
+  each_label_length.resize(batch_size_, 0);
+
+  T kLogZero_ = -std::numeric_limits<T>::infinity();
+  // validate the sequence lengths
+  for (size_t b = 0; b < batch_size_; ++b) {
+    if (sequence_length_addr[b] == uint32_t(0)) {
+      MS_LOG(EXCEPTION) << "Sequence length should be > 0, but got " << sequence_length_addr[b];
+    }
+
+    if (sequence_length_addr[b] > max_time_) {
+      MS_LOG(EXCEPTION) << "Max time should be no less than sequence length, but got " << max_time_ << " < "
+                        << sequence_length_addr[b];
+    }
+  }
+
+  for (size_t i = 0; i < indice_dims_[0]; ++i) {
+    each_label_length[labels_indices_addr[i * 2]]++;
+  }
+
+  // convert label format of label_value and label_indices to batch_label
+  uint64_t cum_sum = 0;
+  for (size_t b = 0; b < batch_size_; ++b) {
+    std::vector<uint32_t> *b_value = &label_batch[b];
+    for (size_t l = 0; l < each_label_length[b]; ++l) {
+      b_value->push_back(labels_values_addr[cum_sum + l]);
+    }
+    cum_sum += each_label_length[b];
+  }
+
+  // convert label to label with blank
+  GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
+
+  for (size_t b = 0; b < batch_size_; ++b) {
+    std::vector<uint32_t> label_with_blank = labels_with_blank[b];
+    // y_b [num_class, sequence_length]
+    std::vector<std::vector<T>> y_b;
+    std::vector<std::vector<T>> dy;
+    std::vector<std::vector<T>> log_alpha_b;
+    std::vector<std::vector<T>> log_beta_b;
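+    // For each batch element b: compute the class-wise softmax of the logits,
+    // run the CTC forward/backward recursions in log space, derive the loss
+    // as -log p(z|x), and back-propagate into the softmax probabilities.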
+    MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
+    MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
+    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
+    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
+    InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);
+
+    CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
+    CalculateBwdVar(label_with_blank, y_b, &log_beta_b);
+
+    T log_pzx = kLogZero_;
+    for (size_t u = 0; u < label_with_blank.size(); ++u) {
+      log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
+    }
+
+    loss_addr[b] = -log_pzx;
+
+    CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);
+
+    for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
+      for (size_t c = 0; c < num_class_; ++c) {
+        gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t];
+      }
+    }
+  }
+}
+
+void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+  if (input_num != 4) {
+    MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but got " << input_num;
+  }
+  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+  if (output_num != 2) {
+    MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but got " << output_num;
+  }
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h
index bdd6c6b9df1..271f4aaf009 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h
@@ -1,92 +1,92 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ -#include -#include -#include -#include -#include -#include "backend/kernel_compiler/cpu/cpu_kernel.h" -#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" - -namespace mindspore { -namespace kernel { -class CTCLossCPUKernel : public CPUKernel { - public: - CTCLossCPUKernel() = default; - ~CTCLossCPUKernel() override = default; - - void InitKernel(const CNodePtr &kernel_node) override; - - bool Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) override; - - void GenLableWithBlank(const uint32_t *seq_len, const std::vector> &batch_label, - std::vector> *label_with_blank); - - template - void CalculateFwdVar(const std::vector &label_with_blank, const std::vector> &y, - std::vector> *log_alpha_b); - template - void CalculateBwdVar(const std::vector &label_with_blank, const std::vector> &y, - std::vector> *log_beta_b); - template - void CalculateGrad(const std::vector &label_with_blank, const std::vector> &y, - const std::vector> &log_alpha_b, const std::vector> &log_beta_b, - const T log_pzx, std::vector> *dy); - - template - void LaunchKernel(const std::vector &inputs, const std::vector &outputs); - - private: - void CheckParam(const CNodePtr &kernel_node); - std::vector probs_shape_; - std::vector indice_dims_; - std::vector labels_dims_; - size_t num_class_; - size_t max_time_; - size_t batch_size_; - uint32_t blank_index_; - TypeId dtype_{kTypeUnknown}; - bool preprocess_collapse_repeated_; - bool ctc_merge_repeated_; - bool ignore_longer_outputs_than_inputs_; -}; - -MS_REG_CPU_KERNEL(CTCLoss, - KernelAttr() - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeInt64) - .AddInputAttr(kNumberTypeInt32) - .AddInputAttr(kNumberTypeInt32) - .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16), - CTCLossCPUKernel); - -MS_REG_CPU_KERNEL(CTCLoss, - KernelAttr() - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeInt64) - .AddInputAttr(kNumberTypeInt32) - .AddInputAttr(kNumberTypeInt32) - .AddOutputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32), - CTCLossCPUKernel); -} // namespace kernel -} // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ +#include +#include +#include +#include +#include +#include "backend/kernel_compiler/cpu/cpu_kernel.h" +#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class CTCLossCPUKernel : public CPUKernel { + public: + CTCLossCPUKernel() = default; + ~CTCLossCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + void GenLableWithBlank(const uint32_t *seq_len, const std::vector> &batch_label, + std::vector> *label_with_blank); + + template + void CalculateFwdVar(const std::vector &label_with_blank, const std::vector> &y, + std::vector> *log_alpha_b); + template + void CalculateBwdVar(const std::vector &label_with_blank, const std::vector> &y, + std::vector> *log_beta_b); + template + void CalculateGrad(const std::vector &label_with_blank, const std::vector> &y, + const std::vector> &log_alpha_b, const std::vector> &log_beta_b, + const T log_pzx, std::vector> *dy); + + template + void LaunchKernel(const std::vector &inputs, const std::vector &outputs); + + private: + void CheckParam(const CNodePtr &kernel_node); + std::vector probs_shape_; + std::vector indice_dims_; + std::vector labels_dims_; + size_t num_class_; + size_t max_time_; + size_t batch_size_; + uint32_t blank_index_; + TypeId dtype_{kTypeUnknown}; + bool preprocess_collapse_repeated_; + bool ctc_merge_repeated_; + bool ignore_longer_outputs_than_inputs_; +}; + +MS_REG_CPU_KERNEL(CTCLoss, + KernelAttr() + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeInt64) + .AddInputAttr(kNumberTypeInt32) + .AddInputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16), + CTCLossCPUKernel); + +MS_REG_CPU_KERNEL(CTCLoss, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeInt64) + .AddInputAttr(kNumberTypeInt32) + .AddInputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + CTCLossCPUKernel); +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc index b0aa95a5a00..70df7e22063 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc @@ -1,89 +1,89 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h" - -#include - -#include "runtime/device/cpu/cpu_device_address.h" - -namespace mindspore { -namespace kernel { -template -void DepthToSpaceCPUKernel::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - CheckParam(kernel_node); - input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); - block_size_ = AnfAlgo::GetNodeAttr(kernel_node, "block_size"); -} - -template -bool DepthToSpaceCPUKernel::Launch(const std::vector &inputs, - const std::vector & /*workspace*/, - const std::vector &outputs) { - auto input_addr = reinterpret_cast(inputs[0]->addr); - auto output_addr = reinterpret_cast(outputs[0]->addr); - size_t size = IntToSize(inputs[0]->size / sizeof(T)); - std::vector input_shape = input_shape_; - std::vector output_shape = output_shape_; - size_t block_size = block_size_; - size_t input_dimension = input_shape.size(); - size_t output_strides[3] = {1, 1, 1}; - - for (size_t i = input_dimension - 1; i >= 1; --i) { - for (size_t j = 0; j < i; ++j) { - output_strides[j] *= output_shape[i]; - } - } - - auto task = [&, input_addr, output_addr](size_t start, size_t end) { - std::vector output_pos_array(input_dimension, 0); - for (size_t i = start; i < end; ++i) { - size_t tmp_pos = i; - for (size_t j = 0; j < input_dimension - 1; ++j) { - output_pos_array[j] = tmp_pos / output_strides[j]; - tmp_pos %= output_strides[j]; - } - output_pos_array.back() = tmp_pos; - size_t input_pos = output_pos_array[0]; - input_pos = - (input_pos * input_shape[1]) + - (output_pos_array[1] + - (block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]); - input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size); - input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size); - output_addr[i] = input_addr[input_pos]; - } - }; - - CPUKernelUtils::ParallelFor(task, size); - return true; -} - -template -void DepthToSpaceCPUKernel::CheckParam(const CNodePtr &kernel_node) { - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - if (input_num != 1) { - MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != 1) { - MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; - } -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h"
+
+#include <vector>
+
+#include "runtime/device/cpu/cpu_device_address.h"
+
+namespace mindspore {
+namespace kernel {
+template <typename T>
+void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
+  MS_EXCEPTION_IF_NULL(kernel_node);
+  CheckParam(kernel_node);
+  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
+  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
+  block_size_ = static_cast<size_t>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size"));
+}
+
+template <typename T>
+bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
+                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
+                                      const std::vector<kernel::AddressPtr> &outputs) {
+  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
+  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
+  size_t size = IntToSize(inputs[0]->size / sizeof(T));
+  std::vector<size_t> input_shape = input_shape_;
+  std::vector<size_t> output_shape = output_shape_;
+  size_t block_size = block_size_;
+  size_t input_dimension = input_shape.size();
+  size_t output_strides[3] = {1, 1, 1};
+
+  for (size_t i = input_dimension - 1; i >= 1; --i) {
+    for (size_t j = 0; j < i; ++j) {
+      output_strides[j] *= output_shape[i];
+    }
+  }
+
+  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
+    std::vector<size_t> output_pos_array(input_dimension, 0);
+    for (size_t i = start; i < end; ++i) {
+      size_t tmp_pos = i;
+      for (size_t j = 0; j < input_dimension - 1; ++j) {
+        output_pos_array[j] = tmp_pos / output_strides[j];
+        tmp_pos %= output_strides[j];
+      }
+      output_pos_array.back() = tmp_pos;
+      size_t input_pos = output_pos_array[0];
+      input_pos =
+        (input_pos * input_shape[1]) +
+        (output_pos_array[1] +
+         (block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]);
+      input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size);
+      input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size);
+      output_addr[i] = input_addr[input_pos];
+    }
+  };
+
+  CPUKernelUtils::ParallelFor(task, size);
+  return true;
+}
+
+template <typename T>
+void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+  if (input_num != 1) {
+    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKernel needs 1 input.";
+  }
+  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+  if (output_num != 1) {
+    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
+  }
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h
index 57e4b8339fd..d7de32f5f94 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h
@@ -1,85 +1,85 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ -#include -#include -#include - -#include "backend/kernel_compiler/cpu/cpu_kernel.h" -#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" -namespace mindspore { -namespace kernel { -template -class DepthToSpaceCPUKernel : public CPUKernel { - public: - DepthToSpaceCPUKernel() = default; - ~DepthToSpaceCPUKernel() override = default; - - void InitKernel(const CNodePtr &kernel_node) override; - bool Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) override; - - private: - void CheckParam(const CNodePtr &kernel_node); - std::vector input_shape_; - std::vector output_shape_; - size_t block_size_; -}; - -MS_REG_CPU_KERNEL_T( - DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), - DepthToSpaceCPUKernel, float); - -MS_REG_CPU_KERNEL_T( - DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), - DepthToSpaceCPUKernel, float16); - -MS_REG_CPU_KERNEL_T(DepthToSpace, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), - DepthToSpaceCPUKernel, int8_t); - -MS_REG_CPU_KERNEL_T(DepthToSpace, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), - DepthToSpaceCPUKernel, int16_t); - -MS_REG_CPU_KERNEL_T(DepthToSpace, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), - DepthToSpaceCPUKernel, int); - -MS_REG_CPU_KERNEL_T(DepthToSpace, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), - DepthToSpaceCPUKernel, int64_t); - -MS_REG_CPU_KERNEL_T(DepthToSpace, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), - DepthToSpaceCPUKernel, uint8_t); - -MS_REG_CPU_KERNEL_T(DepthToSpace, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16), - DepthToSpaceCPUKernel, uint16_t); - -MS_REG_CPU_KERNEL_T(DepthToSpace, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32), - DepthToSpaceCPUKernel, uint32_t); - -MS_REG_CPU_KERNEL_T(DepthToSpace, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), - DepthToSpaceCPUKernel, uint64_t); - -} // namespace kernel -} // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ +#include +#include +#include + +#include "backend/kernel_compiler/cpu/cpu_kernel.h" +#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" +namespace mindspore { +namespace kernel { +template +class DepthToSpaceCPUKernel : public CPUKernel { + public: + DepthToSpaceCPUKernel() = default; + ~DepthToSpaceCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + void CheckParam(const CNodePtr &kernel_node); + std::vector input_shape_; + std::vector output_shape_; + size_t block_size_; +}; + +MS_REG_CPU_KERNEL_T( + DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + DepthToSpaceCPUKernel, float); + +MS_REG_CPU_KERNEL_T( + DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + DepthToSpaceCPUKernel, float16); + +MS_REG_CPU_KERNEL_T(DepthToSpace, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), + DepthToSpaceCPUKernel, int8_t); + +MS_REG_CPU_KERNEL_T(DepthToSpace, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), + DepthToSpaceCPUKernel, int16_t); + +MS_REG_CPU_KERNEL_T(DepthToSpace, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + DepthToSpaceCPUKernel, int); + +MS_REG_CPU_KERNEL_T(DepthToSpace, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), + DepthToSpaceCPUKernel, int64_t); + +MS_REG_CPU_KERNEL_T(DepthToSpace, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), + DepthToSpaceCPUKernel, uint8_t); + +MS_REG_CPU_KERNEL_T(DepthToSpace, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16), + DepthToSpaceCPUKernel, uint16_t); + +MS_REG_CPU_KERNEL_T(DepthToSpace, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32), + DepthToSpaceCPUKernel, uint32_t); + +MS_REG_CPU_KERNEL_T(DepthToSpace, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), + DepthToSpaceCPUKernel, uint64_t); + +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.cc index 32c45a117a7..ac452c13059 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.cc @@ -1,102 +1,102 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h" -#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" -#include "runtime/device/cpu/cpu_device_address.h" -#include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h" -#include "backend/kernel_compiler/cpu/nnacl/errorcode.h" -#include "utils/ms_utils.h" -#include "common/thread_pool.h" - -namespace mindspore { -namespace kernel { -void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) { - int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start); - if (ret != NNACL_OK) { - MS_LOG(EXCEPTION) << "Add failed."; - } -} - -void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - CheckParam(kernel_node); - input_num_ = AnfAlgo::GetInputTensorNum(kernel_node); - dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); - std::vector src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - std::vector src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); - std::vector dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); - dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape); - dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape); - dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape); - dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc); - auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine()); - primitive_ = std::make_shared(prim_desc); - AddArgument(DNNL_ARG_SRC_0, src0_mem_desc); - AddArgument(DNNL_ARG_SRC_1, src1_mem_desc); - AddArgument(DNNL_ARG_DST, dst_mem_desc); -} - -bool AddNCPUKernel::Launch(const std::vector &inputs, const std::vector &, - const std::vector &outputs) { - if (dtype_ == kNumberTypeFloat32) { - SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr); - SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr); - SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); - ExecutePrimitive(); - for (size_t index = 2; index < input_num_; ++index) { - SetArgumentHandle(DNNL_ARG_SRC_0, outputs[0]->addr); - SetArgumentHandle(DNNL_ARG_SRC_1, inputs[index]->addr); - SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); - ExecutePrimitive(); - } - } else if (dtype_ == kNumberTypeInt32) { - size_t elements_num = outputs[0]->size / sizeof(int); - const auto input_0 = reinterpret_cast(inputs[0]->addr); - const auto input_1 = reinterpret_cast(inputs[1]->addr); - auto output = reinterpret_cast(outputs[0]->addr); - auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2); - CPUKernelUtils::ParallelFor(task_0, elements_num); - for (size_t index = 2; index < input_num_; ++index) { - const auto input = reinterpret_cast(inputs[index]->addr); - auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2); - CPUKernelUtils::ParallelFor(task, elements_num); - } - } else { - MS_LOG(EXCEPTION) << "AddN only support float32 and int32, but got " << TypeIdToType(dtype_)->ToString(); - } - return true; -} - 
-void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) { - auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); - if (src0_shape != dst_shape) { - MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape."; - } - for (size_t index = 1; index < input_num_; ++index) { - auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index); - if (src0_shape != src_shape) { - MS_LOG(EXCEPTION) << "AddN input shapes must be equal."; - } - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != 1) { - MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output."; - } -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h" +#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" +#include "runtime/device/cpu/cpu_device_address.h" +#include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h" +#include "backend/kernel_compiler/cpu/nnacl/errorcode.h" +#include "utils/ms_utils.h" +#include "common/thread_pool.h" + +namespace mindspore { +namespace kernel { +void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) { + int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start); + if (ret != NNACL_OK) { + MS_LOG(EXCEPTION) << "Add failed."; + } +} + +void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + CheckParam(kernel_node); + input_num_ = AnfAlgo::GetInputTensorNum(kernel_node); + dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); + std::vector src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + std::vector src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); + std::vector dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); + dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape); + dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape); + dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape); + dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc); + auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine()); + primitive_ = std::make_shared(prim_desc); + AddArgument(DNNL_ARG_SRC_0, src0_mem_desc); + AddArgument(DNNL_ARG_SRC_1, src1_mem_desc); + AddArgument(DNNL_ARG_DST, dst_mem_desc); +} + +bool AddNCPUKernel::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs) { + if (dtype_ == kNumberTypeFloat32) { + SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr); + SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr); + SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); + ExecutePrimitive(); + for (size_t index = 2; index < input_num_; ++index) { + SetArgumentHandle(DNNL_ARG_SRC_0, 
outputs[0]->addr);
+      SetArgumentHandle(DNNL_ARG_SRC_1, inputs[index]->addr);
+      SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
+      ExecutePrimitive();
+    }
+  } else if (dtype_ == kNumberTypeInt32) {
+    size_t elements_num = outputs[0]->size / sizeof(int);
+    const auto input_0 = reinterpret_cast<int *>(inputs[0]->addr);
+    const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
+    auto output = reinterpret_cast<int *>(outputs[0]->addr);
+    auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
+    CPUKernelUtils::ParallelFor(task_0, elements_num);
+    for (size_t index = 2; index < input_num_; ++index) {
+      const auto input = reinterpret_cast<int *>(inputs[index]->addr);
+      auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
+      CPUKernelUtils::ParallelFor(task, elements_num);
+    }
+  } else {
+    MS_LOG(EXCEPTION) << "AddN only supports float32 and int32, but got " << TypeIdToType(dtype_)->ToString();
+  }
+  return true;
+}
+
+void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) {
+  auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
+  if (src0_shape != dst_shape) {
+    MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape.";
+  }
+  for (size_t index = 1; index < input_num_; ++index) {
+    auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index);
+    if (src0_shape != src_shape) {
+      MS_LOG(EXCEPTION) << "AddN input shapes must be equal.";
+    }
+  }
+  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+  if (output_num != 1) {
+    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output.";
+  }
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h
index 21547d6082f..a47861fa7be 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h
@@ -1,51 +1,51 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ -#include -#include -#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" - -namespace mindspore { -namespace kernel { -class AddNCPUKernel : public MKLCPUKernel { - public: - AddNCPUKernel() = default; - ~AddNCPUKernel() override = default; - - void InitKernel(const CNodePtr &kernel_node) override; - - bool Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) override; - - private: - void CheckParam(const CNodePtr &kernel_node); - size_t input_num_{0}; - std::vector output_shape_; - TypeId dtype_{kNumberTypeFloat32}; -}; - -MS_REG_CPU_KERNEL(AddN, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), - AddNCPUKernel); -MS_REG_CPU_KERNEL(AddN, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), - AddNCPUKernel); -} // namespace kernel -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ +#include +#include +#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" + +namespace mindspore { +namespace kernel { +class AddNCPUKernel : public MKLCPUKernel { + public: + AddNCPUKernel() = default; + ~AddNCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + void CheckParam(const CNodePtr &kernel_node); + size_t input_num_{0}; + std::vector output_shape_; + TypeId dtype_{kNumberTypeFloat32}; +}; + +MS_REG_CPU_KERNEL(AddN, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + AddNCPUKernel); +MS_REG_CPU_KERNEL(AddN, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + AddNCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.cc index 82e2780dec8..b0277203dbf 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.cc @@ -1,178 +1,178 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h" -#include -#include "utils/ms_utils.h" -#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" -#include "runtime/device/cpu/cpu_device_address.h" - -namespace mindspore { -namespace kernel { -const int kMaxLSTMLayer = 100; -const int kOutputWorkSpaceIndex = 3; -void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { - CPUKernel::InitInputOutputSize(kernel_node); - output_size_list_[kOutputWorkSpaceIndex] = reserve_size_; - auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0); - auto output_types = std::vector(output_num, output_type); - std::vector> output_shapes; - for (size_t output_index = 0; output_index < output_num; ++output_index) { - std::vector shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index); - output_shapes.emplace_back(shape); - } - size_t len = reserve_size_ / 4; - output_shapes[kOutputWorkSpaceIndex] = {len, 1}; - AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get()); -} - -void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) { -#ifdef PLATFORM_86 - _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); -#endif - MS_EXCEPTION_IF_NULL(kernel_node); - using tag = dnnl::memory::format_tag; - using dim = dnnl::memory::dims; - CheckParam(kernel_node); - auto eng = MKLKernelEngine::Get().engine(); - dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; - if (bidirectional_) { - direction = dnnl::rnn_direction::bidirectional_concat; - } - dim src_dims = {seq_len_, batch_size_, input_size_}; - dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_}; - weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; - bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_}; - dim dst_dims = {seq_len_, batch_size_, static_cast(hidden_size_) * num_directions_}; - dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); - dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); - dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc); - dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo); - dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); - dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); - dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); - if (!kernel_node->HasAttr(kAttrIsTraining)) { - is_training = true; - } else { - is_training = GetValue(kernel_node->GetAttr(kAttrIsTraining)); - } - auto prop_kind = dnnl::prop_kind::forward_training; - if (!is_training) { - prop_kind = dnnl::prop_kind::forward_inference; - } - auto desc = std::make_shared( - 
prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any), - formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc); - prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng); - primitive_ = std::make_shared(prim_desc_); - if (is_training) { - reserve_size_ = static_cast(prim_desc_.workspace_desc().get_size()); - AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc()); - } else { - reserve_size_ = 1; - } - AddArgument(DNNL_ARG_SRC_LAYER, src_desc); - AddArgument(DNNL_ARG_SRC_ITER, src_h_desc); - AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); - AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc()); - AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc()); - AddArgument(DNNL_ARG_BIAS, bias_desc); - AddArgument(DNNL_ARG_DST_LAYER, dst_desc); - AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); - AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc); -} - -void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) { - std::vector src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - std::vector src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); - std::vector src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); - bidirectional_ = AnfAlgo::GetNodeAttr(kernel_node, "bidirectional"); - input_size_ = static_cast(AnfAlgo::GetNodeAttr(kernel_node, "input_size")); - hidden_size_ = static_cast(AnfAlgo::GetNodeAttr(kernel_node, "hidden_size")); - num_layers_ = static_cast(AnfAlgo::GetNodeAttr(kernel_node, "num_layers")); - has_bias_ = AnfAlgo::GetNodeAttr(kernel_node, "has_bias"); - batch_size_ = SizeToInt(src_shape[1]); - seq_len_ = SizeToInt(src_shape[0]); - num_directions_ = 1; - if (bidirectional_) { - num_directions_ = 2; - } - const int gate_size = 4 * hidden_size_; - if (num_layers_ <= 0) { - MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; - } - if (num_layers_ > kMaxLSTMLayer) { - MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; - } - for (int i = 0; i < num_layers_; ++i) { - weight_size_ += gate_size * (i == 0 ? 
input_size_ : hidden_size_ * num_directions_); - weight_h_size_ += gate_size * hidden_size_; - } - weight_size_ = weight_size_ * num_directions_; - weight_h_size_ = weight_h_size_ * num_directions_; - if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { - MS_LOG(EXCEPTION) << "Error iteration shape!"; - } - if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { - MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; - } -} - -bool LstmCPUKernel::Launch(const std::vector &inputs, const std::vector &, - const std::vector &outputs) { - using dt = dnnl::memory::data_type; - using tag = dnnl::memory::format_tag; - auto eng = MKLKernelEngine::Get().engine(); - auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); - auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); - auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng); - auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng); - user_weights_memory.set_data_handle(inputs[3]->addr); - user_weights_h_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_); - Reorder(&user_weights_memory, &weights_memory); - Reorder(&user_weights_h_memory, &weights_h_memory); - auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng); - if (has_bias_) { - bias_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_ + weight_h_size_); - } else { - if (memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0, - prim_desc_.bias_desc().get_size())) { - MS_LOG(EXCEPTION) << "Bias memset error"; - } - } - // set handle - SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); - SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); - SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); - SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); - SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); - SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); - SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr); - SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr); - SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr); - if (is_training) { - SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr); - } - ExecutePrimitive(); - return true; -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
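The prop_kind selected in InitKernel above is what creates the extra reserve buffer: under dnnl::prop_kind::forward_training, oneDNN reports a non-empty workspace that the backward pass later consumes, while forward_inference needs none (reserve_size_ falls back to 1). InitInputOutputSize then republishes that byte count as output index 3 with shape {reserve_size_ / 4, 1}, i.e. a float32 tensor wide enough to carry it. A minimal standalone sketch of the same query, written against the oneDNN 1.x API this file uses (all dimensions illustrative, not taken from the diff):

    #include <iostream>
    #include "dnnl.hpp"

    int main() {
      using tag = dnnl::memory::format_tag;
      using dt = dnnl::memory::data_type;
      dnnl::engine eng(dnnl::engine::kind::cpu, 0);
      // seq T, batch N, input C, hidden H, layers L, directions D, gates G
      const long T = 4, N = 2, C = 8, H = 8, L = 1, D = 1, G = 4;
      auto md = [](dnnl::memory::dims dims, tag t) { return dnnl::memory::desc(dims, dt::f32, t); };
      dnnl::lstm_forward::desc desc(dnnl::prop_kind::forward_training, dnnl::rnn_direction::unidirectional,
                                    md({T, N, C}, tag::tnc), md({L, D, N, H}, tag::ldnc), md({L, D, N, H}, tag::ldnc),
                                    md({L, D, C, G, H}, tag::any), md({L, D, H, G, H}, tag::any),
                                    md({L, D, G, H}, tag::ldgo), md({T, N, H}, tag::tnc),
                                    md({L, D, N, H}, tag::ldnc), md({L, D, N, H}, tag::ldnc));
      dnnl::lstm_forward::primitive_desc pd(desc, eng);
      // Non-zero only for forward_training; this is what the kernel stores in reserve_size_.
      std::cout << "workspace bytes: " << pd.workspace_desc().get_size() << std::endl;
      return 0;
    }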
+ */ +#include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h" +#include +#include "utils/ms_utils.h" +#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" +#include "runtime/device/cpu/cpu_device_address.h" + +namespace mindspore { +namespace kernel { +const int kMaxLSTMLayer = 100; +const int kOutputWorkSpaceIndex = 3; +void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { + CPUKernel::InitInputOutputSize(kernel_node); + output_size_list_[kOutputWorkSpaceIndex] = reserve_size_; + auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node); + auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0); + auto output_types = std::vector(output_num, output_type); + std::vector> output_shapes; + for (size_t output_index = 0; output_index < output_num; ++output_index) { + std::vector shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index); + output_shapes.emplace_back(shape); + } + size_t len = reserve_size_ / 4; + output_shapes[kOutputWorkSpaceIndex] = {len, 1}; + AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get()); +} + +void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) { +#ifdef PLATFORM_86 + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif + MS_EXCEPTION_IF_NULL(kernel_node); + using tag = dnnl::memory::format_tag; + using dim = dnnl::memory::dims; + CheckParam(kernel_node); + auto eng = MKLKernelEngine::Get().engine(); + dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; + if (bidirectional_) { + direction = dnnl::rnn_direction::bidirectional_concat; + } + dim src_dims = {seq_len_, batch_size_, input_size_}; + dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_}; + weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; + bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_}; + dim dst_dims = {seq_len_, batch_size_, static_cast(hidden_size_) * num_directions_}; + dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); + dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); + dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc); + dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo); + dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); + dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); + dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); + if (!kernel_node->HasAttr(kAttrIsTraining)) { + is_training = true; + } else { + is_training = GetValue(kernel_node->GetAttr(kAttrIsTraining)); + } + auto prop_kind = dnnl::prop_kind::forward_training; + if (!is_training) { + prop_kind = dnnl::prop_kind::forward_inference; + } + auto desc = std::make_shared( + prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any), + formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc); + prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng); + primitive_ = std::make_shared(prim_desc_); + if (is_training) { + reserve_size_ = static_cast(prim_desc_.workspace_desc().get_size()); + AddArgument(DNNL_ARG_WORKSPACE, 
prim_desc_.workspace_desc()); + } else { + reserve_size_ = 1; + } + AddArgument(DNNL_ARG_SRC_LAYER, src_desc); + AddArgument(DNNL_ARG_SRC_ITER, src_h_desc); + AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); + AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc()); + AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc()); + AddArgument(DNNL_ARG_BIAS, bias_desc); + AddArgument(DNNL_ARG_DST_LAYER, dst_desc); + AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); + AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc); +} + +void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) { + std::vector src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + std::vector src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); + std::vector src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); + bidirectional_ = AnfAlgo::GetNodeAttr(kernel_node, "bidirectional"); + input_size_ = static_cast(AnfAlgo::GetNodeAttr(kernel_node, "input_size")); + hidden_size_ = static_cast(AnfAlgo::GetNodeAttr(kernel_node, "hidden_size")); + num_layers_ = static_cast(AnfAlgo::GetNodeAttr(kernel_node, "num_layers")); + has_bias_ = AnfAlgo::GetNodeAttr(kernel_node, "has_bias"); + batch_size_ = SizeToInt(src_shape[1]); + seq_len_ = SizeToInt(src_shape[0]); + num_directions_ = 1; + if (bidirectional_) { + num_directions_ = 2; + } + const int gate_size = 4 * hidden_size_; + if (num_layers_ <= 0) { + MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; + } + if (num_layers_ > kMaxLSTMLayer) { + MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; + } + for (int i = 0; i < num_layers_; ++i) { + weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); + weight_h_size_ += gate_size * hidden_size_; + } + weight_size_ = weight_size_ * num_directions_; + weight_h_size_ = weight_h_size_ * num_directions_; + if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { + MS_LOG(EXCEPTION) << "Error iteration shape!"; + } + if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { + MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; + } +} + +bool LstmCPUKernel::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs) { + using dt = dnnl::memory::data_type; + using tag = dnnl::memory::format_tag; + auto eng = MKLKernelEngine::Get().engine(); + auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); + auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); + auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng); + auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng); + user_weights_memory.set_data_handle(inputs[3]->addr); + user_weights_h_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_); + Reorder(&user_weights_memory, &weights_memory); + Reorder(&user_weights_h_memory, &weights_h_memory); + auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng); + if (has_bias_) { + bias_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_ + weight_h_size_); + } else { + if (memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0, + prim_desc_.bias_desc().get_size())) { + MS_LOG(EXCEPTION) << "Bias memset error"; + } + } + // set handle + SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); + SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); + SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); + 
SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr); + SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr); + SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr); + if (is_training) { + SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr); + } + ExecutePrimitive(); + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h index 79689849cc2..a5e646a2ff3 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h @@ -1,76 +1,76 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ -#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) -#define PLATFORM_86 -#endif -#ifdef PLATFORM_86 -#include -#endif -#include -#include -#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" -namespace mindspore { -namespace kernel { -class LstmCPUKernel : public MKLCPUKernel { - public: - LstmCPUKernel() = default; - ~LstmCPUKernel() override = default; - void InitKernel(const CNodePtr &kernel_node) override; - - bool Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) override; - - protected: - void InitInputOutputSize(const CNodePtr &kernel_node) override; - - private: - void CheckParam(const CNodePtr &kernel_node); - int weight_size_ = 0; - int weight_h_size_ = 0; - int input_size_; - int hidden_size_; - int num_layers_; - int batch_size_; - int seq_len_; - int num_directions_; - bool bidirectional_; - bool has_bias_; - size_t reserve_size_; - bool is_training; - dnnl::memory::dims weights_dims_; - dnnl::memory::dims weights_h_dims_; - dnnl::memory::dims bias_dims_; - dnnl::lstm_forward::primitive_desc prim_desc_; -}; - -MS_REG_CPU_KERNEL(LSTM, - KernelAttr() - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32), - LstmCPUKernel); -} // namespace kernel -} // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ +#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) +#define PLATFORM_86 +#endif +#ifdef PLATFORM_86 +#include +#endif +#include +#include +#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" +namespace mindspore { +namespace kernel { +class LstmCPUKernel : public MKLCPUKernel { + public: + LstmCPUKernel() = default; + ~LstmCPUKernel() override = default; + void InitKernel(const CNodePtr &kernel_node) override; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + protected: + void InitInputOutputSize(const CNodePtr &kernel_node) override; + + private: + void CheckParam(const CNodePtr &kernel_node); + int weight_size_ = 0; + int weight_h_size_ = 0; + int input_size_; + int hidden_size_; + int num_layers_; + int batch_size_; + int seq_len_; + int num_directions_; + bool bidirectional_; + bool has_bias_; + size_t reserve_size_; + bool is_training; + dnnl::memory::dims weights_dims_; + dnnl::memory::dims weights_h_dims_; + dnnl::memory::dims bias_dims_; + dnnl::lstm_forward::primitive_desc prim_desc_; +}; + +MS_REG_CPU_KERNEL(LSTM, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + LstmCPUKernel); +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.cc index bb614ed5c04..e5a3c742468 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.cc @@ -1,218 +1,218 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
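Two details of the re-added forward kernel are easy to miss. First, on x86 builds InitKernel enables flush-to-zero and denormals-are-zero (_MM_SET_FLUSH_ZERO_MODE / _MM_SET_DENORMALS_ZERO_MODE), so vanishing cell states do not drop the whole RNN onto slow denormal paths. Second, Launch slices one flat float32 parameter buffer, inputs[3]: layer weights at offset 0, recurrent weights at weight_size_, and bias at weight_size_ + weight_h_size_, with the offsets accumulated in CheckParam. A standalone sketch of that bookkeeping (hyperparameter values illustrative):

    #include <cstdio>

    int main() {
      const int num_layers = 2, input_size = 16, hidden_size = 32;
      const bool bidirectional = false;
      const int num_directions = bidirectional ? 2 : 1;
      const int gate_size = 4 * hidden_size;  // four LSTM gates per cell
      int weight_size = 0, weight_h_size = 0;
      for (int i = 0; i < num_layers; ++i) {
        // Layer 0 reads the input; deeper layers read the (possibly concatenated) hidden state.
        weight_size += gate_size * (i == 0 ? input_size : hidden_size * num_directions);
        weight_h_size += gate_size * hidden_size;
      }
      weight_size *= num_directions;
      weight_h_size *= num_directions;
      std::printf("w_ih floats: %d, w_hh offset: %d, bias offset: %d\n",
                  weight_size, weight_size, weight_size + weight_h_size);
      return 0;
    }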
- */ -#include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h" -#include -#include -#include "utils/ms_utils.h" -#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" -#include "runtime/device/cpu/cpu_device_address.h" - -namespace mindspore { -namespace kernel { -const int kMaxLSTMLayer = 100; -const int kInputWorkSpaceIndex = 10; -void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { - CPUKernel::InitInputOutputSize(kernel_node); - input_size_list_[kInputWorkSpaceIndex] = reserve_size_; -} - -void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - using tag = dnnl::memory::format_tag; - using dim = dnnl::memory::dims; - CheckParam(kernel_node); - auto eng = MKLKernelEngine::Get().engine(); - dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; - if (bidirectional_) { - direction = dnnl::rnn_direction::bidirectional_concat; - } - dim src_dims = {seq_len_, batch_size_, input_size_}; - dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_}; - weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; - bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_}; - dim dst_dims = {seq_len_, batch_size_, static_cast(hidden_size_) * num_directions_}; - dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); - dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); - dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc); - dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo); - dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); - dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); - dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); - auto forward_desc = std::make_shared( - dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc, - formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, - dst_c_desc); - auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng); - auto backward_desc = std::make_shared( - dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any), - formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc, - src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, - dst_h_desc, dst_c_desc); - prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc); - primitive_ = std::make_shared(prim_backward_desc_); - reserve_size_ = static_cast(prim_forward_desc.workspace_desc().get_size()); - AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc()); - AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc); -} - -void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc, - const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc, - const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc, - const 
dnnl::memory::desc &dst_c_desc) { - AddArgument(DNNL_ARG_SRC_LAYER, src_desc); - AddArgument(DNNL_ARG_SRC_ITER, src_h_desc); - AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); - AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc()); - AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc()); - AddArgument(DNNL_ARG_BIAS, bias_desc); - AddArgument(DNNL_ARG_DST_LAYER, dst_desc); - AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); - AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc); - AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc); - AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc); - AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc); - AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc()); - AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc()); - AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc); - AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc); - AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc); - AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc); -} - -void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { - std::vector src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - std::vector src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); - std::vector src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); - bidirectional_ = AnfAlgo::GetNodeAttr(kernel_node, "bidirectional"); - input_size_ = AnfAlgo::GetNodeAttr(kernel_node, "input_size"); - hidden_size_ = AnfAlgo::GetNodeAttr(kernel_node, "hidden_size"); - num_layers_ = AnfAlgo::GetNodeAttr(kernel_node, "num_layers"); - has_bias_ = AnfAlgo::GetNodeAttr(kernel_node, "has_bias"); - batch_size_ = SizeToInt(src_shape[1]); - seq_len_ = SizeToInt(src_shape[0]); - num_directions_ = 1; - if (bidirectional_) { - num_directions_ = 2; - } - const int64_t gate_size = 4 * hidden_size_; - if (num_layers_ <= 0) { - MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; - } - if (num_layers_ > kMaxLSTMLayer) { - MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; - } - for (int64_t i = 0; i < num_layers_; ++i) { - weight_size_ += gate_size * (i == 0 ? 
input_size_ : hidden_size_ * num_directions_); - weight_h_size_ += gate_size * hidden_size_; - } - weight_size_ = weight_size_ * num_directions_; - weight_h_size_ = weight_h_size_ * num_directions_; - if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) { - MS_LOG(EXCEPTION) << "Error iteration shape!"; - } - if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { - MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; - } -} - -void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector &inputs, - const std::vector &outputs, - const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory, - const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory, - const dnnl::memory &diff_weights_h_memory, - const dnnl::memory &diff_bias_memory) { - SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); - SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); - SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); - SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); - SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); - SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); - SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr); - SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr); - SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr); - SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[10]->addr); - SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr); - SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr); - SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr); - SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()); - SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()); - SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()); - SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr); - SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr); - SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr); -} - -void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const { - if (memset_s(mem.get_data_handle(), mem.get_desc().get_size(), 0, mem.get_desc().get_size())) { - MS_LOG(EXCEPTION) << name << " memset error"; - } -} - -bool LSTMGradCPUKernel::Launch(const std::vector &inputs, const std::vector &, - const std::vector &outputs) { - using dt = dnnl::memory::data_type; - using tag = dnnl::memory::format_tag; - auto eng = MKLKernelEngine::Get().engine(); - // construct fw memory - auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); - auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); - auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng); - auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng); - auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng); - user_weights_memory.set_data_handle(inputs[3]->addr); - user_weights_h_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_); - Reorder(&user_weights_memory, &weights_memory); - Reorder(&user_weights_h_memory, &weights_h_memory); - if (has_bias_) { - bias_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_ + weight_h_size_); - } else { - if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0, - 
prim_backward_desc_.bias_desc().get_size())) { - MS_LOG(EXCEPTION) << "Bias memset error"; - } - } - // construct bw memory - auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng); - auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng); - auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng); - auto user_diff_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); - auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); - user_diff_weights_memory.set_data_handle(outputs[3]->addr); - user_diff_weights_h_memory.set_data_handle(reinterpret_cast(outputs[3]->addr) + weight_size_); - ResetMemory(user_diff_weights_memory, "user weights grad"); - ResetMemory(user_diff_weights_h_memory, "user weights iter grad"); - ResetMemory(diff_weights_memory, "weights grad"); - ResetMemory(diff_weights_h_memory, "weights iter grad"); - if (has_bias_) { - diff_bias_memory.set_data_handle(reinterpret_cast(outputs[3]->addr) + weight_size_ + weight_h_size_); - } - if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0, - prim_backward_desc_.diff_bias_desc().get_size())) { - MS_LOG(EXCEPTION) << "Bias grad memset error"; - } - SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory, - diff_weights_h_memory, diff_bias_memory); - ExecutePrimitive(); - Reorder(&diff_weights_memory, &user_diff_weights_memory); - Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory); - return true; -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
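One constraint in the grad kernel's InitKernel above is worth spelling out: dnnl::lstm_backward::primitive_desc is constructed with the forward-training primitive descriptor as a hint, and the workspace produced by the forward kernel must be handed back verbatim (it arrives here as input index 10 and is bound to DNNL_ARG_WORKSPACE). Schematically, assuming eng, forward_desc and backward_desc built as in the code above (a fragment, not a complete program):

    // The hint must come from a forward_training descriptor; an inference pd has no workspace.
    auto fwd_pd = dnnl::lstm_forward::primitive_desc(*forward_desc, eng);
    auto bwd_pd = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, fwd_pd);
    // Forward and backward agree on the opaque workspace layout, which is why the grad
    // kernel re-derives reserve_size_ from the forward pd rather than trusting the input tensor.
    assert(fwd_pd.workspace_desc().get_size() == bwd_pd.workspace_desc().get_size());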
+ */ +#include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h" +#include +#include +#include "utils/ms_utils.h" +#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" +#include "runtime/device/cpu/cpu_device_address.h" + +namespace mindspore { +namespace kernel { +const int kMaxLSTMLayer = 100; +const int kInputWorkSpaceIndex = 10; +void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { + CPUKernel::InitInputOutputSize(kernel_node); + input_size_list_[kInputWorkSpaceIndex] = reserve_size_; +} + +void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + using tag = dnnl::memory::format_tag; + using dim = dnnl::memory::dims; + CheckParam(kernel_node); + auto eng = MKLKernelEngine::Get().engine(); + dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; + if (bidirectional_) { + direction = dnnl::rnn_direction::bidirectional_concat; + } + dim src_dims = {seq_len_, batch_size_, input_size_}; + dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_}; + weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; + bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_}; + dim dst_dims = {seq_len_, batch_size_, static_cast(hidden_size_) * num_directions_}; + dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); + dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); + dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc); + dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo); + dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); + dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); + dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); + auto forward_desc = std::make_shared( + dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc, + formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, + dst_c_desc); + auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng); + auto backward_desc = std::make_shared( + dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any), + formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc, + src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, + dst_h_desc, dst_c_desc); + prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc); + primitive_ = std::make_shared(prim_backward_desc_); + reserve_size_ = static_cast(prim_forward_desc.workspace_desc().get_size()); + AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc()); + AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc); +} + +void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc, + const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc, + const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc, + const 
dnnl::memory::desc &dst_c_desc) { + AddArgument(DNNL_ARG_SRC_LAYER, src_desc); + AddArgument(DNNL_ARG_SRC_ITER, src_h_desc); + AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); + AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc()); + AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc()); + AddArgument(DNNL_ARG_BIAS, bias_desc); + AddArgument(DNNL_ARG_DST_LAYER, dst_desc); + AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); + AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc); + AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc); + AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc); + AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc); + AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc()); + AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc()); + AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc); + AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc); + AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc); + AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc); +} + +void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { + std::vector src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + std::vector src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); + std::vector src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); + bidirectional_ = AnfAlgo::GetNodeAttr(kernel_node, "bidirectional"); + input_size_ = AnfAlgo::GetNodeAttr(kernel_node, "input_size"); + hidden_size_ = AnfAlgo::GetNodeAttr(kernel_node, "hidden_size"); + num_layers_ = AnfAlgo::GetNodeAttr(kernel_node, "num_layers"); + has_bias_ = AnfAlgo::GetNodeAttr(kernel_node, "has_bias"); + batch_size_ = SizeToInt(src_shape[1]); + seq_len_ = SizeToInt(src_shape[0]); + num_directions_ = 1; + if (bidirectional_) { + num_directions_ = 2; + } + const int64_t gate_size = 4 * hidden_size_; + if (num_layers_ <= 0) { + MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; + } + if (num_layers_ > kMaxLSTMLayer) { + MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; + } + for (int64_t i = 0; i < num_layers_; ++i) { + weight_size_ += gate_size * (i == 0 ? 
input_size_ : hidden_size_ * num_directions_); + weight_h_size_ += gate_size * hidden_size_; + } + weight_size_ = weight_size_ * num_directions_; + weight_h_size_ = weight_h_size_ * num_directions_; + if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) { + MS_LOG(EXCEPTION) << "Error iteration shape!"; + } + if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { + MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; + } +} + +void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector &inputs, + const std::vector &outputs, + const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory, + const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory, + const dnnl::memory &diff_weights_h_memory, + const dnnl::memory &diff_bias_memory) { + SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); + SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); + SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); + SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr); + SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr); + SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr); + SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[10]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr); +} + +void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const { + if (memset_s(mem.get_data_handle(), mem.get_desc().get_size(), 0, mem.get_desc().get_size())) { + MS_LOG(EXCEPTION) << name << " memset error"; + } +} + +bool LSTMGradCPUKernel::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs) { + using dt = dnnl::memory::data_type; + using tag = dnnl::memory::format_tag; + auto eng = MKLKernelEngine::Get().engine(); + // construct fw memory + auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); + auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); + auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng); + auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng); + auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng); + user_weights_memory.set_data_handle(inputs[3]->addr); + user_weights_h_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_); + Reorder(&user_weights_memory, &weights_memory); + Reorder(&user_weights_h_memory, &weights_h_memory); + if (has_bias_) { + bias_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_ + weight_h_size_); + } else { + if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0, + 
prim_backward_desc_.bias_desc().get_size())) { + MS_LOG(EXCEPTION) << "Bias memset error"; + } + } + // construct bw memory + auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng); + auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng); + auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng); + auto user_diff_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); + auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); + user_diff_weights_memory.set_data_handle(outputs[3]->addr); + user_diff_weights_h_memory.set_data_handle(reinterpret_cast(outputs[3]->addr) + weight_size_); + ResetMemory(user_diff_weights_memory, "user weights grad"); + ResetMemory(user_diff_weights_h_memory, "user weights iter grad"); + ResetMemory(diff_weights_memory, "weights grad"); + ResetMemory(diff_weights_h_memory, "weights iter grad"); + if (has_bias_) { + diff_bias_memory.set_data_handle(reinterpret_cast(outputs[3]->addr) + weight_size_ + weight_h_size_); + } + if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0, + prim_backward_desc_.diff_bias_desc().get_size())) { + MS_LOG(EXCEPTION) << "Bias grad memset error"; + } + SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory, + diff_weights_h_memory, diff_bias_memory); + ExecutePrimitive(); + Reorder(&diff_weights_memory, &user_diff_weights_memory); + Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory); + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h index 0e5a08ac64c..ad008d6a952 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h @@ -1,87 +1,87 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
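Note also why the grad Launch above zero-fills the gradient buffers before executing: oneDNN writes the weight gradients into its own blocked layout, and the kernel afterwards reorders them into the caller's ldgoi buffer (outputs[3]), so stale bytes in either copy would corrupt the result; the bias gradient is cleared for the no-bias case as well. The round trip, schematically (assuming eng, a dnnl::stream strm, bwd_pd as in the previous sketch, and dims/out_ptr standing in for the real shapes and output pointer):

    auto diff_w = dnnl::memory(bwd_pd.diff_weights_layer_desc(), eng);                        // blocked
    auto user_diff_w = dnnl::memory(dnnl::memory::desc{{dims}, dt::f32, tag::ldgoi}, eng, out_ptr);
    std::memset(diff_w.get_data_handle(), 0, diff_w.get_desc().get_size());
    std::memset(user_diff_w.get_data_handle(), 0, user_diff_w.get_desc().get_size());
    // ... execute the lstm_backward primitive ...
    dnnl::reorder(diff_w, user_diff_w).execute(strm, diff_w, user_diff_w);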
- */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ - -#include -#include -#include -#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" - -namespace mindspore { -namespace kernel { -class LSTMGradCPUKernel : public MKLCPUKernel { - public: - LSTMGradCPUKernel() = default; - ~LSTMGradCPUKernel() override = default; - - void InitKernel(const CNodePtr &kernel_node) override; - bool Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) override; - - protected: - void InitInputOutputSize(const CNodePtr &kernel_node) override; - - private: - void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc, - const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc, - const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc, - const dnnl::memory::desc &dst_c_desc); - void SetArgumentHandleOp(const std::vector &inputs, - const std::vector &outputs, const dnnl::memory &weights_memory, - const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory, - const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory, - const dnnl::memory &diff_bias_memory); - void ResetMemory(const dnnl::memory &mem, const string name) const; - void CheckParam(const CNodePtr &kernel_node); - int64_t weight_size_ = 0; - int64_t weight_h_size_ = 0; - int64_t input_size_; - int64_t hidden_size_; - int64_t num_layers_; - int64_t batch_size_; - int64_t seq_len_; - int num_directions_; - bool bidirectional_; - bool has_bias_; - size_t reserve_size_; - dnnl::memory::dims weights_dims_; - dnnl::memory::dims weights_h_dims_; - dnnl::memory::dims bias_dims_; - dnnl::lstm_backward::primitive_desc prim_backward_desc_; -}; - -MS_REG_CPU_KERNEL(LSTMGrad, - KernelAttr() - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32), - LSTMGradCPUKernel); -} // namespace kernel -} // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ + +#include +#include +#include +#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" + +namespace mindspore { +namespace kernel { +class LSTMGradCPUKernel : public MKLCPUKernel { + public: + LSTMGradCPUKernel() = default; + ~LSTMGradCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + protected: + void InitInputOutputSize(const CNodePtr &kernel_node) override; + + private: + void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc, + const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc, + const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc, + const dnnl::memory::desc &dst_c_desc); + void SetArgumentHandleOp(const std::vector &inputs, + const std::vector &outputs, const dnnl::memory &weights_memory, + const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory, + const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory, + const dnnl::memory &diff_bias_memory); + void ResetMemory(const dnnl::memory &mem, const string name) const; + void CheckParam(const CNodePtr &kernel_node); + int64_t weight_size_ = 0; + int64_t weight_h_size_ = 0; + int64_t input_size_; + int64_t hidden_size_; + int64_t num_layers_; + int64_t batch_size_; + int64_t seq_len_; + int num_directions_; + bool bidirectional_; + bool has_bias_; + size_t reserve_size_; + dnnl::memory::dims weights_dims_; + dnnl::memory::dims weights_h_dims_; + dnnl::memory::dims bias_dims_; + dnnl::lstm_backward::primitive_desc prim_backward_desc_; +}; + +MS_REG_CPU_KERNEL(LSTMGrad, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + LSTMGradCPUKernel); +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.cc index 2dd2a8a3540..9c02fb497ca 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.cc @@ -1,99 +1,99 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
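Reading LSTMGrad's registration above together with SetArgumentHandleOp in the .cc gives the positional contract for the eleven inputs and four outputs (the x/h/c shorthand names are editorial, not from the diff):

    inputs[0..2]   x, h0, c0            -> SRC_LAYER / SRC_ITER / SRC_ITER_C
    inputs[3]      packed weights+bias  -> WEIGHTS_LAYER / WEIGHTS_ITER / BIAS (sliced by offset)
    inputs[4..6]   y, hn, cn            -> DST_LAYER / DST_ITER / DST_ITER_C
    inputs[7..9]   dy, dhn, dcn         -> DIFF_DST_LAYER / DIFF_DST_ITER / DIFF_DST_ITER_C
    inputs[10]     forward workspace    -> WORKSPACE
    outputs[0..2]  dx, dh0, dc0         -> DIFF_SRC_LAYER / DIFF_SRC_ITER / DIFF_SRC_ITER_C
    outputs[3]     packed weight grads  -> DIFF_WEIGHTS_LAYER / DIFF_WEIGHTS_ITER / DIFF_BIAS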
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h" -#include -#include -#include -#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" -#include "runtime/device/cpu/cpu_device_address.h" -#include "utils/ms_utils.h" - -namespace mindspore { -namespace kernel { -void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { - CPUKernel::InitInputOutputSize(kernel_node); - MS_EXCEPTION_IF_NULL(kernel_node); - size_t type_size = sizeof(float); - std::vector shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies()); - workspace_size_list_.emplace_back(tensor_size); -} - -void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - std::vector shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - dnnl::memory::dims mem_dims; - mem_dims.insert(mem_dims.end(), shape.begin(), shape.end()); - if (mem_dims.size() != 2) { - MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size(); - } - batch_size_ = shape[0]; - class_num_ = shape[1]; - if (batch_size_ == 0 || class_num_ == 0) { - MS_LOG(EXCEPTION) << "Invalid batch size or class num input!"; - } - dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc); - - dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1); - auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine()); - primitive_ = std::make_shared(prim_desc); - - AddArgument(DNNL_ARG_SRC, mem_desc); - AddArgument(DNNL_ARG_DST, mem_desc); -} - -void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels, - float *output1, float *output2) const { - float epsilon = 1e-6; - for (size_t i = 0; i < batch_size_; ++i) { - output1[i] = 0; - float loss = 0.0; - for (size_t j = 0; j < class_num_; ++j) { - float logit = logf(logits[i * class_num_ + j] <= 0.0 ? 
epsilon : logits[i * class_num_ + j]); - output2[i * class_num_ + j] = logits[i * class_num_ + j] - labels[i * class_num_ + j]; - loss += labels[i * class_num_ + j] * logit; - } - output1[i] = -loss; - } -} - -bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector &inputs, - const std::vector &workspace, - const std::vector &outputs) { - if (inputs.empty() || workspace.empty() || outputs.empty()) { - MS_LOG(EXCEPTION) << "Error input output size!"; - } - size_t batch_float_size = batch_size_ * sizeof(float); - size_t batch_class_float_size = class_num_ * batch_float_size; - if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size || - inputs[1]->size != batch_class_float_size) { - MS_LOG(EXCEPTION) << "Error input data size!"; - } - if (outputs[1]->size != batch_class_float_size || outputs[0]->size != batch_float_size) { - MS_LOG(EXCEPTION) << "Error output data size!"; - } - SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); - SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr); - ExecutePrimitive(); - auto labels = reinterpret_cast(inputs[1]->addr); - auto logits = reinterpret_cast(workspace[0]->addr); - auto output1 = reinterpret_cast(outputs[0]->addr); - auto output2 = reinterpret_cast(outputs[1]->addr); - ForwardPostExecute(logits, labels, output1, output2); - return true; -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
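ForwardPostExecute above fuses the two standard identities for softmax cross-entropy: the per-sample loss is -sum_j y_ij * log(p_ij), and the gradient with respect to the logits collapses to p - y. Note that the parameter named logits actually holds p, the softmax probabilities the oneDNN primitive wrote into workspace[0] (DNNL_ARG_DST). A standalone restatement in plain C++ (no oneDNN; function and variable names are illustrative):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // p: softmax probabilities, y: labels, both flattened batch x classes.
    void ce_post(const std::vector<float> &p, const std::vector<float> &y, size_t batch,
                 size_t classes, std::vector<float> *loss, std::vector<float> *grad) {
      const float eps = 1e-6f;  // same guard the kernel applies before logf
      for (size_t i = 0; i < batch; ++i) {
        float acc = 0.0f;
        for (size_t j = 0; j < classes; ++j) {
          const size_t k = i * classes + j;
          acc += y[k] * std::log(p[k] <= 0.0f ? eps : p[k]);
          (*grad)[k] = p[k] - y[k];
        }
        (*loss)[i] = -acc;
      }
    }

    int main() {
      std::vector<float> p = {0.7f, 0.2f, 0.1f}, y = {1.0f, 0.0f, 0.0f};
      std::vector<float> loss(1), grad(3);
      ce_post(p, y, 1, 3, &loss, &grad);
      std::printf("loss=%f grad[0]=%f\n", loss[0], grad[0]);  // ~0.3567 and -0.3
      return 0;
    }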
+ */
+#include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h"
+#include <functional>
+#include <numeric>
+#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
+#include "runtime/device/cpu/cpu_device_address.h"
+#include "utils/ms_utils.h"
+
+namespace mindspore {
+namespace kernel {
+void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
+  CPUKernel::InitInputOutputSize(kernel_node);
+  MS_EXCEPTION_IF_NULL(kernel_node);
+  size_t type_size = sizeof(float);
+  std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
+  size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
+  workspace_size_list_.emplace_back(tensor_size);
+}
+
+void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
+  MS_EXCEPTION_IF_NULL(kernel_node);
+  std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
+  dnnl::memory::dims mem_dims;
+  mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
+  if (mem_dims.size() != 2) {
+    MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
+  }
+  batch_size_ = shape[0];
+  class_num_ = shape[1];
+  if (batch_size_ == 0 || class_num_ == 0) {
+    MS_LOG(EXCEPTION) << "Invalid batch size or class num input!";
+  }
+  dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc);
+
+  dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1);
+  auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
+  primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc);
+
+  AddArgument(DNNL_ARG_SRC, mem_desc);
+  AddArgument(DNNL_ARG_DST, mem_desc);
+}
+
+void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels,
+                                                                float *output1, float *output2) const {
+  float epsilon = 1e-6;
+  for (size_t i = 0; i < batch_size_; ++i) {
+    output1[i] = 0;
+    float loss = 0.0;
+    for (size_t j = 0; j < class_num_; ++j) {
+      float logit = logf(logits[i * class_num_ + j] <= 0.0 ?
epsilon : logits[i * class_num_ + j]); + output2[i * class_num_ + j] = logits[i * class_num_ + j] - labels[i * class_num_ + j]; + loss += labels[i * class_num_ + j] * logit; + } + output1[i] = -loss; + } +} + +bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs) { + if (inputs.empty() || workspace.empty() || outputs.empty()) { + MS_LOG(EXCEPTION) << "Error input output size!"; + } + size_t batch_float_size = batch_size_ * sizeof(float); + size_t batch_class_float_size = class_num_ * batch_float_size; + if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size || + inputs[1]->size != batch_class_float_size) { + MS_LOG(EXCEPTION) << "Error input data size!"; + } + if (outputs[1]->size != batch_class_float_size || outputs[0]->size != batch_float_size) { + MS_LOG(EXCEPTION) << "Error output data size!"; + } + SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); + SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr); + ExecutePrimitive(); + auto labels = reinterpret_cast(inputs[1]->addr); + auto logits = reinterpret_cast(workspace[0]->addr); + auto output1 = reinterpret_cast(outputs[0]->addr); + auto output2 = reinterpret_cast(outputs[1]->addr); + ForwardPostExecute(logits, labels, output1, output2); + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h index cc9346fe1bf..367b12d93a1 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h @@ -1,53 +1,53 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
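For quick reference, the buffer contract that the re-added Launch enforces before running (N = batch_size_, C = class_num_):

    inputs[0]     logits             N*C*sizeof(float), must equal workspace[0]'s size
    inputs[1]     labels             N*C*sizeof(float)
    workspace[0]  softmax scratch    N*C*sizeof(float), doubles as DNNL_ARG_DST
    outputs[0]    per-sample loss    N*sizeof(float)
    outputs[1]    dloss/dlogits      N*C*sizeof(float)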
- */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ - -#include -#include -#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" - -namespace mindspore { -namespace kernel { -class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel { - public: - SoftmaxCrossEntropyWithLogitsCPUKernel() = default; - ~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default; - - void InitKernel(const CNodePtr &kernel_node) override; - - bool Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) override; - - protected: - void InitInputOutputSize(const CNodePtr &kernel_node) override; - - private: - void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const; - size_t class_num_{0}; - size_t batch_size_{0}; -}; -MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits, - KernelAttr() - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32), - SoftmaxCrossEntropyWithLogitsCPUKernel); -} // namespace kernel -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
+
+#include <vector>
+#include <memory>
+#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
+ public:
+  SoftmaxCrossEntropyWithLogitsCPUKernel() = default;
+  ~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
+
+  void InitKernel(const CNodePtr &kernel_node) override;
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs) override;
+
+ protected:
+  void InitInputOutputSize(const CNodePtr &kernel_node) override;
+
+ private:
+  void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const;
+  size_t class_num_{0};
+  size_t batch_size_{0};
+};
+MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits,
+                  KernelAttr()
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32),
+                  SoftmaxCrossEntropyWithLogitsCPUKernel);
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/pserver_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/pserver_kernel.h
index c36466d580d..f1d05694de4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/pserver_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/pserver_kernel.h
@@ -1,59 +1,59 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
-  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ - -#include -#include -#include "backend/kernel_compiler/kernel.h" -#include "ps/util.h" - -namespace mindspore { -namespace kernel { -namespace ps { -using mindspore::ps::Util; -class PServerKernel { - public: - PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num) - : rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {} - ~PServerKernel() = default; - PServerKernel(const PServerKernel &) = delete; - PServerKernel &operator=(const PServerKernel &) = delete; - virtual void InitKernel(const std::shared_ptr>>> &) {} - virtual void InitKernel(const CNodePtr &cnode, - const std::shared_ptr>>> &) {} - virtual void ReInit(const std::vector> &) {} - virtual bool Execute(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) = 0; - virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals, - size_t ids_size) {} - virtual const std::vector &input_sizes() const = 0; - virtual const std::vector &output_sizes() const = 0; - virtual const std::vector &workspace_sizes() const = 0; - - protected: - virtual void ReInit(const std::vector &) {} - void Shard(std::vector *shape, int axis); - - size_t rank_id_; - size_t pserver_num_; - size_t worker_num_; -}; -} // namespace ps -} // namespace kernel -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ + +#include +#include +#include "backend/kernel_compiler/kernel.h" +#include "ps/util.h" + +namespace mindspore { +namespace kernel { +namespace ps { +using mindspore::ps::Util; +class PServerKernel { + public: + PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num) + : rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {} + ~PServerKernel() = default; + PServerKernel(const PServerKernel &) = delete; + PServerKernel &operator=(const PServerKernel &) = delete; + virtual void InitKernel(const std::shared_ptr>>> &) {} + virtual void InitKernel(const CNodePtr &cnode, + const std::shared_ptr>>> &) {} + virtual void ReInit(const std::vector> &) {} + virtual bool Execute(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) = 0; + virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals, + size_t ids_size) {} + virtual const std::vector &input_sizes() const = 0; + virtual const std::vector &output_sizes() const = 0; + virtual const std::vector &workspace_sizes() const = 0; + + protected: + virtual void ReInit(const std::vector &) {} + void Shard(std::vector *shape, int axis); + + size_t rank_id_; + size_t pserver_num_; + size_t worker_num_; +}; +} // namespace ps +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc index c9aad40e1af..622cffb5cbd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc @@ -1,138 +1,138 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h" -#include -#include -#include -#include - -namespace mindspore { -namespace kernel { -template -void ReduceCPUKernel::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS); - if (axis_addr->isa() || axis_addr->isa()) { - axis_ = AnfAlgo::GetNodeAttr>(kernel_node, AXIS); - } else if (axis_addr->isa()) { - axis_.emplace_back(AnfAlgo::GetNodeAttr(kernel_node, AXIS)); - } else { - MS_LOG(EXCEPTION) << "Attribute is invalid"; - } - - int dimension = input_shape_.size(); - std::transform(axis_.begin(), axis_.end(), axis_.begin(), - [dimension](const auto &a) { return a < 0 ? dimension + a : a; }); - sort(axis_.begin(), axis_.end()); - // Delete the duplicate axis. 
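PServerKernel above is the abstract interface for server-side kernels in parameter-server mode: a concrete kernel chiefly supplies Execute plus the three size accessors the server uses to allocate buffers. A standalone miniature of that contract; every name below is illustrative, not MindSpore API:

#include <vector>

struct MiniServerKernel {
  virtual ~MiniServerKernel() = default;
  virtual bool Execute(const std::vector<float> &inputs, std::vector<float> *outputs) = 0;
  virtual const std::vector<size_t> &input_sizes() const = 0;
};

struct SumKernel : MiniServerKernel {
  bool Execute(const std::vector<float> &inputs, std::vector<float> *outputs) override {
    float sum = 0.0f;
    for (float v : inputs) sum += v;  // stand-in for an optimizer or embedding update
    outputs->assign(1, sum);
    return true;
  }
  const std::vector<size_t> &input_sizes() const override { return sizes_; }

 private:
  std::vector<size_t> sizes_{4};  // one input buffer of 4 elements
};

int main() {
  SumKernel kernel;
  std::vector<float> out;
  return kernel.Execute({1.0f, 2.0f, 3.0f, 4.0f}, &out) && out[0] == 10.0f ? 0 : 1;
}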
- auto last = std::unique(axis_.begin(), axis_.end()); - axis_.erase(last, axis_.end()); - auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); - - if constexpr (std::is_same::value) { - if (kernel_name == "ReduceAll") { - reduce_type_ = kReduceAll; - reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; }; - } else if (kernel_name == "ReduceAny") { - reduce_type_ = kReduceAny; - reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; }; - } else { - MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool."; - } - } else { - if (kernel_name == "ReduceMax") { - reduce_type_ = kReduceMax; - reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); }; - } else if (kernel_name == "ReduceMin") { - reduce_type_ = kReduceMin; - reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); }; - } else if (kernel_name == "ReduceSum") { - reduce_type_ = kReduceSum; - reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; - } else if (kernel_name == "ReduceMean") { - reduce_type_ = kReduceMean; - reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; - } else { - MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name; - } - } -} - -template -bool ReduceCPUKernel::Launch(const std::vector &inputs, const std::vector &, - const std::vector &outputs) { - size_t input_size = inputs[0]->size / sizeof(T); - auto input_addr = reinterpret_cast(inputs[0]->addr); - auto output_addr = reinterpret_cast(outputs[0]->addr); - if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) { - // Get one ret - *output_addr = input_addr[0]; - for (size_t i = 1; i < input_size; ++i) { - reduce_func_(input_addr, i, output_addr); - } - if (reduce_type_ == kReduceMean) { - *output_addr /= input_size; - } - } else { - // Calculate transpose axes and stride - int dimension = input_shape_.size(); - size_t stride = 1; - std::vector axes(input_shape_.size()); - size_t j = 0; - size_t k = 0; - for (int i = 0; i < dimension; ++i) { - if (j == axis_.size() || i != axis_[j]) { - axes[k] = i; - ++k; - } else { - stride *= input_shape_[i]; - ++j; - } - } - for (auto &it : axis_) { - axes[k] = it; - ++k; - } - // Calculate transpose shape - std::vector transpose_shape(input_shape_.size()); - for (int i = 0; i < dimension; ++i) { - transpose_shape[i] = input_shape_[axes[i]]; - } - size_t output_size = outputs[0]->size / sizeof(T); - TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_); - auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) { - auto iter = base_iter; - iter.SetPos(start * stride); - for (size_t i = start; i < end; ++i) { - output_addr[i] = input_addr[iter.GetPos()]; - iter.GenNextPos(); - for (size_t j = 1; j < stride; ++j) { - reduce_func_(input_addr, iter.GetPos(), &output_addr[i]); - iter.GenNextPos(); - } - if (reduce_type_ == kReduceMean) { - output_addr[i] /= stride; - } - } - }; - CPUKernelUtils::ParallelFor(task, output_size); - } - return true; -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h" +#include +#include +#include +#include + +namespace mindspore { +namespace kernel { +template +void ReduceCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS); + if (axis_addr->isa() || axis_addr->isa()) { + axis_ = AnfAlgo::GetNodeAttr>(kernel_node, AXIS); + } else if (axis_addr->isa()) { + axis_.emplace_back(AnfAlgo::GetNodeAttr(kernel_node, AXIS)); + } else { + MS_LOG(EXCEPTION) << "Attribute is invalid"; + } + + int dimension = input_shape_.size(); + std::transform(axis_.begin(), axis_.end(), axis_.begin(), + [dimension](const auto &a) { return a < 0 ? dimension + a : a; }); + sort(axis_.begin(), axis_.end()); + // Delete the duplicate axis. + auto last = std::unique(axis_.begin(), axis_.end()); + axis_.erase(last, axis_.end()); + auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); + + if constexpr (std::is_same::value) { + if (kernel_name == "ReduceAll") { + reduce_type_ = kReduceAll; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; }; + } else if (kernel_name == "ReduceAny") { + reduce_type_ = kReduceAny; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; }; + } else { + MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool."; + } + } else { + if (kernel_name == "ReduceMax") { + reduce_type_ = kReduceMax; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); }; + } else if (kernel_name == "ReduceMin") { + reduce_type_ = kReduceMin; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); }; + } else if (kernel_name == "ReduceSum") { + reduce_type_ = kReduceSum; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; + } else if (kernel_name == "ReduceMean") { + reduce_type_ = kReduceMean; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; + } else { + MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name; + } + } +} + +template +bool ReduceCPUKernel::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs) { + size_t input_size = inputs[0]->size / sizeof(T); + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); + if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) { + // Get one ret + *output_addr = input_addr[0]; + for (size_t i = 1; i < input_size; ++i) { + reduce_func_(input_addr, i, output_addr); + } + if (reduce_type_ == kReduceMean) { + *output_addr /= input_size; + } + } else { + // Calculate transpose axes and stride + int dimension = input_shape_.size(); + size_t stride = 1; + std::vector axes(input_shape_.size()); + size_t j = 0; + size_t k = 0; + for (int i = 0; i < dimension; ++i) { + if (j == axis_.size() || i != axis_[j]) { + axes[k] = i; + ++k; + } else { + stride *= input_shape_[i]; + 
++j; + } + } + for (auto &it : axis_) { + axes[k] = it; + ++k; + } + // Calculate transpose shape + std::vector transpose_shape(input_shape_.size()); + for (int i = 0; i < dimension; ++i) { + transpose_shape[i] = input_shape_[axes[i]]; + } + size_t output_size = outputs[0]->size / sizeof(T); + TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_); + auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) { + auto iter = base_iter; + iter.SetPos(start * stride); + for (size_t i = start; i < end; ++i) { + output_addr[i] = input_addr[iter.GetPos()]; + iter.GenNextPos(); + for (size_t j = 1; j < stride; ++j) { + reduce_func_(input_addr, iter.GetPos(), &output_addr[i]); + iter.GenNextPos(); + } + if (reduce_type_ == kReduceMean) { + output_addr[i] /= stride; + } + } + }; + CPUKernelUtils::ParallelFor(task, output_size); + } + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h index 65229891286..775c649b2f5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h @@ -1,69 +1,69 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
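The Launch path above first permutes the reduced axes to the innermost positions, so every output element consumes one contiguous run of `stride` inputs, and ReduceMean finishes by dividing each accumulator by `stride`. A standalone miniature of the same strategy; the shape and values are assumed for illustration:

#include <cstdio>
#include <vector>

int main() {
  // Reduce a row-major (2, 3) array over axis 1. With the reduced axis
  // innermost, each output element is a contiguous run of `stride` inputs,
  // which is exactly the layout the kernel's transpose step establishes.
  std::vector<float> input = {1, 2, 3, 4, 5, 6};
  const size_t output_size = 2;
  const size_t stride = 3;  // product of the reduced dimensions
  std::vector<float> output(output_size);
  for (size_t i = 0; i < output_size; ++i) {
    output[i] = input[i * stride];
    for (size_t j = 1; j < stride; ++j) {
      output[i] += input[i * stride + j];  // reduce_func_ would be applied here
    }
    // ReduceMean would finish with: output[i] /= stride;
  }
  std::printf("%g %g\n", output[0], output[1]);  // prints "6 15"
  return 0;
}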
- */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ -#include -#include -#include -#include -#include "backend/kernel_compiler/cpu/cpu_kernel.h" -#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" - -namespace mindspore { -namespace kernel { -template -class ReduceCPUKernel : public CPUKernel { - public: - ReduceCPUKernel() = default; - ~ReduceCPUKernel() override = default; - void InitKernel(const CNodePtr &kernel_node) override; - bool Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) override; - - private: - enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean }; - std::vector input_shape_; - std::vector axis_; - ReduceType reduce_type_{kReduceAll}; - std::function reduce_func_; -}; - -MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float); -MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double); -MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t); -MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t); - -MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float); -MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double); -MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t); -MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t); - -MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float); -MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double); -MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t); -MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t); - -MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float); -MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double); -MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t); -MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t); - -MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool); - -MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool); -} // namespace kernel -} // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ +#include +#include +#include +#include +#include "backend/kernel_compiler/cpu/cpu_kernel.h" +#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +template +class ReduceCPUKernel : public CPUKernel { + public: + ReduceCPUKernel() = default; + ~ReduceCPUKernel() override = default; + void InitKernel(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean }; + std::vector input_shape_; + std::vector axis_; + ReduceType reduce_type_{kReduceAll}; + std::function reduce_func_; +}; + +MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float); +MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double); +MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t); +MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t); + +MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float); +MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double); +MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t); +MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t); + +MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float); +MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double); +MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t); +MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t); + +MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float); +MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double); +MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t); +MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t); + +MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool); + +MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool); +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.cc index e28142b6599..a7298e35194 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.cc @@ -1,91 +1,91 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
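The MS_REG_CPU_KERNEL_T lines above instantiate and register ReduceCPUKernel once per (operation, element type) pair. The sketch below shows a generic self-registration idiom of the kind such macros commonly expand to; it is illustrative only and not MindSpore's actual factory code:

#include <functional>
#include <map>
#include <memory>
#include <string>

struct KernelBase {
  virtual ~KernelBase() = default;
};

std::map<std::string, std::function<std::unique_ptr<KernelBase>()>> &Registry() {
  static std::map<std::string, std::function<std::unique_ptr<KernelBase>()>> registry;
  return registry;
}

struct Registrar {
  Registrar(const std::string &name, std::function<std::unique_ptr<KernelBase>()> creator) {
    Registry()[name] = std::move(creator);
  }
};

// One static Registrar per (op, dtype) pair, analogous to one MS_REG_CPU_KERNEL_T line.
#define REG_KERNEL_T(NAME, CLASS, TYPE) \
  static Registrar g_##NAME##_##TYPE(#NAME "_" #TYPE, [] { return std::unique_ptr<KernelBase>(new CLASS<TYPE>()); })

template <typename T>
struct ReduceSumKernel : KernelBase {};

REG_KERNEL_T(ReduceSum, ReduceSumKernel, float);

int main() { return Registry().count("ReduceSum_float") == 1 ? 0 : 1; }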
- */ - -#include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h" - -#include - -#include "runtime/device/cpu/cpu_device_address.h" - -namespace mindspore { -namespace kernel { -template -void SpaceToDepthCPUKernel::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - CheckParam(kernel_node); - - input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); - block_size_ = AnfAlgo::GetNodeAttr(kernel_node, "block_size"); -} - -template -bool SpaceToDepthCPUKernel::Launch(const std::vector &inputs, - const std::vector & /*workspace*/, - const std::vector &outputs) { - auto input_addr = reinterpret_cast(inputs[0]->addr); - auto output_addr = reinterpret_cast(outputs[0]->addr); - size_t size = IntToSize(inputs[0]->size / sizeof(T)); - - std::vector input_shape = input_shape_; - std::vector output_shape = output_shape_; - size_t block_size = block_size_; - size_t input_dimension = input_shape.size(); - size_t input_strides[3] = {1, 1, 1}; - - for (size_t i = input_dimension - 1; i >= 1; --i) { - for (size_t j = 0; j < i; ++j) { - input_strides[j] *= input_shape[i]; - } - } - - auto task = [&, input_addr, output_addr](size_t start, size_t end) { - std::vector input_pos_array(input_dimension, 0); - for (size_t i = start; i < end; ++i) { - size_t tmp_pos = i; - for (size_t j = 0; j < input_dimension - 1; ++j) { - input_pos_array[j] = tmp_pos / input_strides[j]; - tmp_pos %= input_strides[j]; - } - input_pos_array.back() = tmp_pos; - size_t output_pos = input_pos_array[0]; - output_pos = - (output_pos * output_shape[1]) + - (input_pos_array[1] + - (block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]); - output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size); - output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size); - output_addr[output_pos] = input_addr[i]; - } - }; - - CPUKernelUtils::ParallelFor(task, size); - return true; -} - -template -void SpaceToDepthCPUKernel::CheckParam(const CNodePtr &kernel_node) { - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - if (input_num != 1) { - MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != 1) { - MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; - } -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h" + +#include + +#include "runtime/device/cpu/cpu_device_address.h" + +namespace mindspore { +namespace kernel { +template +void SpaceToDepthCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + CheckParam(kernel_node); + + input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); + block_size_ = AnfAlgo::GetNodeAttr(kernel_node, "block_size"); +} + +template +bool SpaceToDepthCPUKernel::Launch(const std::vector &inputs, + const std::vector & /*workspace*/, + const std::vector &outputs) { + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); + size_t size = IntToSize(inputs[0]->size / sizeof(T)); + + std::vector input_shape = input_shape_; + std::vector output_shape = output_shape_; + size_t block_size = block_size_; + size_t input_dimension = input_shape.size(); + size_t input_strides[3] = {1, 1, 1}; + + for (size_t i = input_dimension - 1; i >= 1; --i) { + for (size_t j = 0; j < i; ++j) { + input_strides[j] *= input_shape[i]; + } + } + + auto task = [&, input_addr, output_addr](size_t start, size_t end) { + std::vector input_pos_array(input_dimension, 0); + for (size_t i = start; i < end; ++i) { + size_t tmp_pos = i; + for (size_t j = 0; j < input_dimension - 1; ++j) { + input_pos_array[j] = tmp_pos / input_strides[j]; + tmp_pos %= input_strides[j]; + } + input_pos_array.back() = tmp_pos; + size_t output_pos = input_pos_array[0]; + output_pos = + (output_pos * output_shape[1]) + + (input_pos_array[1] + + (block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]); + output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size); + output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size); + output_addr[output_pos] = input_addr[i]; + } + }; + + CPUKernelUtils::ParallelFor(task, size); + return true; +} + +template +void SpaceToDepthCPUKernel::CheckParam(const CNodePtr &kernel_node) { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != 1) { + MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; + } + size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); + if (output_num != 1) { + MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; + } +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h index 6e12ff85371..20b93f2cce5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h @@ -1,84 +1,84 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ -#include -#include - -#include "backend/kernel_compiler/cpu/cpu_kernel.h" -#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" -namespace mindspore { -namespace kernel { -template -class SpaceToDepthCPUKernel : public CPUKernel { - public: - SpaceToDepthCPUKernel() = default; - ~SpaceToDepthCPUKernel() override = default; - - void InitKernel(const CNodePtr &kernel_node) override; - bool Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) override; - - private: - void CheckParam(const CNodePtr &kernel_node); - std::vector input_shape_; - std::vector output_shape_; - size_t block_size_; -}; - -MS_REG_CPU_KERNEL_T( - SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), - SpaceToDepthCPUKernel, float); - -MS_REG_CPU_KERNEL_T( - SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), - SpaceToDepthCPUKernel, float16); - -MS_REG_CPU_KERNEL_T(SpaceToDepth, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), - SpaceToDepthCPUKernel, int8_t); - -MS_REG_CPU_KERNEL_T(SpaceToDepth, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), - SpaceToDepthCPUKernel, int16_t); - -MS_REG_CPU_KERNEL_T(SpaceToDepth, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), - SpaceToDepthCPUKernel, int); - -MS_REG_CPU_KERNEL_T(SpaceToDepth, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), - SpaceToDepthCPUKernel, int64_t); - -MS_REG_CPU_KERNEL_T(SpaceToDepth, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), - SpaceToDepthCPUKernel, uint8_t); - -MS_REG_CPU_KERNEL_T(SpaceToDepth, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16), - SpaceToDepthCPUKernel, uint16_t); - -MS_REG_CPU_KERNEL_T(SpaceToDepth, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32), - SpaceToDepthCPUKernel, uint32_t); - -MS_REG_CPU_KERNEL_T(SpaceToDepth, - KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), - SpaceToDepthCPUKernel, uint64_t); - -} // namespace kernel -} // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ +#include +#include + +#include "backend/kernel_compiler/cpu/cpu_kernel.h" +#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" +namespace mindspore { +namespace kernel { +template +class SpaceToDepthCPUKernel : public CPUKernel { + public: + SpaceToDepthCPUKernel() = default; + ~SpaceToDepthCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + void CheckParam(const CNodePtr &kernel_node); + std::vector input_shape_; + std::vector output_shape_; + size_t block_size_; +}; + +MS_REG_CPU_KERNEL_T( + SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + SpaceToDepthCPUKernel, float); + +MS_REG_CPU_KERNEL_T( + SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + SpaceToDepthCPUKernel, float16); + +MS_REG_CPU_KERNEL_T(SpaceToDepth, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), + SpaceToDepthCPUKernel, int8_t); + +MS_REG_CPU_KERNEL_T(SpaceToDepth, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), + SpaceToDepthCPUKernel, int16_t); + +MS_REG_CPU_KERNEL_T(SpaceToDepth, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + SpaceToDepthCPUKernel, int); + +MS_REG_CPU_KERNEL_T(SpaceToDepth, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), + SpaceToDepthCPUKernel, int64_t); + +MS_REG_CPU_KERNEL_T(SpaceToDepth, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), + SpaceToDepthCPUKernel, uint8_t); + +MS_REG_CPU_KERNEL_T(SpaceToDepth, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16), + SpaceToDepthCPUKernel, uint16_t); + +MS_REG_CPU_KERNEL_T(SpaceToDepth, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32), + SpaceToDepthCPUKernel, uint32_t); + +MS_REG_CPU_KERNEL_T(SpaceToDepth, + KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), + SpaceToDepthCPUKernel, uint64_t); + +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.cc index 794670793cd..11a22a37e87 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.cc @@ -1,87 +1,87 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include "backend/kernel_compiler/cpu/topk_cpu_kernel.h" -#include "runtime/device/cpu/cpu_device_address.h" - -namespace mindspore { -namespace kernel { -template -void TopKCPUKernel::LaunchKernel(const std::vector &inputs, const std::vector &outputs) { - if (inputs.size() != 2 || outputs.size() != 2) { - MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but get inputs: " << inputs.size() - << "outputs: " << outputs.size(); - } - if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) { - MS_LOG(EXCEPTION) << "Error input data size!"; - } - if (inputs[1]->size != sizeof(int)) { - MS_LOG(EXCEPTION) << "Input K must be int!"; - } - auto input = reinterpret_cast(inputs[0]->addr); - int k = reinterpret_cast(inputs[1]->addr)[0]; - auto output = reinterpret_cast(outputs[0]->addr); - auto indices = reinterpret_cast(outputs[1]->addr); - if (k < 1) { - MS_LOG(EXCEPTION) << "Input k must > 0!"; - } - size_t k_num = IntToSize(std::min(inner_size_, k)); - if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) { - MS_LOG(EXCEPTION) << "Error output data size!"; - } - for (size_t i = 0; i < outer_size_; ++i) { - std::vector idx(inner_size_); - auto base_input = i * inner_size_; - std::iota(idx.begin(), idx.end(), base_input); - std::stable_sort(idx.begin(), idx.end(), - [&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; }); - auto base_output = i * k_num; - if (!sorted_) { - std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num)); - } - for (size_t j = 0; j < k_num; ++j) { - indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input); - output[base_output + j] = input[idx[j]]; - } - } -} - -void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - for (size_t i = 0; i < x_shape_.size() - 1; ++i) { - outer_size_ *= x_shape_[i]; - } - inner_size_ = x_shape_[x_shape_.size() - 1]; - sorted_ = AnfAlgo::GetNodeAttr(kernel_node, "sorted"); - dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); -} - -bool TopKCPUKernel::Launch(const std::vector &inputs, const std::vector &, - const std::vector &outputs) { - if (dtype_ == kNumberTypeFloat16) { - LaunchKernel(inputs, outputs); - } else if (dtype_ == kNumberTypeFloat32) { - LaunchKernel(inputs, outputs); - } - return true; -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <algorithm>
+#include <numeric>
+#include <functional>
+#include <vector>
+#include "backend/kernel_compiler/cpu/topk_cpu_kernel.h"
+#include "runtime/device/cpu/cpu_device_address.h"
+
+namespace mindspore {
+namespace kernel {
+template <typename T>
+void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
+  if (inputs.size() != 2 || outputs.size() != 2) {
+    MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but got inputs: " << inputs.size()
+                      << ", outputs: " << outputs.size();
+  }
+  if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) {
+    MS_LOG(EXCEPTION) << "Error input data size!";
+  }
+  if (inputs[1]->size != sizeof(int)) {
+    MS_LOG(EXCEPTION) << "Input K must be int!";
+  }
+  auto input = reinterpret_cast<T *>(inputs[0]->addr);
+  int k = reinterpret_cast<int *>(inputs[1]->addr)[0];
+  auto output = reinterpret_cast<T *>(outputs[0]->addr);
+  auto indices = reinterpret_cast<int *>(outputs[1]->addr);
+  if (k < 1) {
+    MS_LOG(EXCEPTION) << "Input k must be greater than 0, but got " << k;
+  }
+  size_t k_num = std::min(inner_size_, IntToSize(k));
+  if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) {
+    MS_LOG(EXCEPTION) << "Error output data size!";
+  }
+  for (size_t i = 0; i < outer_size_; ++i) {
+    std::vector<size_t> idx(inner_size_);
+    auto base_input = i * inner_size_;
+    std::iota(idx.begin(), idx.end(), base_input);
+    std::stable_sort(idx.begin(), idx.end(),
+                     [&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; });
+    auto base_output = i * k_num;
+    if (!sorted_) {
+      std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num));
+    }
+    for (size_t j = 0; j < k_num; ++j) {
+      indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input);
+      output[base_output + j] = input[idx[j]];
+    }
+  }
+}
+
+void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) {
+  MS_EXCEPTION_IF_NULL(kernel_node);
+  auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  for (size_t i = 0; i < x_shape_.size() - 1; ++i) {
+    outer_size_ *= x_shape_[i];
+  }
+  inner_size_ = x_shape_[x_shape_.size() - 1];
+  sorted_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "sorted");
+  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
+}
+
+bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
+                           const std::vector<kernel::AddressPtr> &outputs) {
+  if (dtype_ == kNumberTypeFloat16) {
+    LaunchKernel<float16>(inputs, outputs);
+  } else if (dtype_ == kNumberTypeFloat32) {
+    LaunchKernel<float>(inputs, outputs);
+  }
+  return true;
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.h
index 157456037bf..72ccb769486 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.h
@@ -1,46 +1,46 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
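The selection loop above is an argsort-based top-k: sort a row's indices by value with a stable sort, keep the first k, and re-sort those k indices ascending when sorted is false so results follow input order. A standalone sketch of the same idiom, with illustrative values:

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<float> row = {0.3f, 0.9f, 0.1f, 0.7f};
  const size_t k = 2;
  std::vector<size_t> idx(row.size());
  std::iota(idx.begin(), idx.end(), 0);  // 0, 1, 2, 3
  std::stable_sort(idx.begin(), idx.end(),
                   [&row](size_t a, size_t b) { return row[a] > row[b]; });  // argsort, descending by value
  for (size_t j = 0; j < k; ++j) {
    std::printf("value=%g index=%zu\n", row[idx[j]], idx[j]);  // 0.9 @ 1, then 0.7 @ 3
  }
  return 0;
}

For large rows, a std::nth_element pass before the final sort would cut the per-row cost from O(n log n) to roughly O(n + k log k); the kernel's full stable_sort instead preserves the order of tied values, which matters for deterministic indices.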
- */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ -#include -#include -#include -#include "backend/kernel_compiler/cpu/cpu_kernel.h" -#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" - -namespace mindspore { -namespace kernel { -class TopKCPUKernel : public CPUKernel { - public: - TopKCPUKernel() = default; - ~TopKCPUKernel() override = default; - void InitKernel(const CNodePtr &kernel_node) override; - bool Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs) override; - - private: - template - void LaunchKernel(const std::vector &inputs, const std::vector &outputs); - size_t outer_size_{1}; - size_t inner_size_{1}; - bool sorted_{false}; - TypeId dtype_{kTypeUnknown}; -}; - -MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel) -} // namespace kernel -} // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ +#include +#include +#include +#include "backend/kernel_compiler/cpu/cpu_kernel.h" +#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class TopKCPUKernel : public CPUKernel { + public: + TopKCPUKernel() = default; + ~TopKCPUKernel() override = default; + void InitKernel(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + template + void LaunchKernel(const std::vector &inputs, const std::vector &outputs); + size_t outer_size_{1}; + size_t inner_size_{1}; + bool sorted_{false}; + TypeId dtype_{kTypeUnknown}; +}; + +MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel) +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc index 12e7401585b..4dba82b928b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc @@ -1,159 +1,159 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
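Unlike the templated Reduce and SpaceToDepth kernels, TopKCPUKernel registers a single untyped class and branches on dtype_ inside Launch through a templated private LaunchKernel. A standalone miniature of that runtime-dispatch idiom; the TypeId values and the uint16_t stand-in for float16 are illustrative:

#include <cstdint>

enum TypeId { kNumberTypeFloat16, kNumberTypeFloat32 };  // stand-ins for the real TypeId values

struct MiniKernel {
  TypeId dtype_{kNumberTypeFloat32};

  template <typename T>
  bool LaunchKernel() {
    return sizeof(T) > 0;  // the typed worker would run here
  }

  bool Launch() {
    if (dtype_ == kNumberTypeFloat16) {
      return LaunchKernel<uint16_t>();  // uint16_t stands in for float16 storage
    } else if (dtype_ == kNumberTypeFloat32) {
      return LaunchKernel<float>();
    }
    return false;
  }
};

int main() {
  MiniKernel kernel;
  return kernel.Launch() ? 0 : 1;
}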
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h" -#include -#include -#include "runtime/device/cpu/cpu_device_address.h" -#include "common/thread_pool.h" -#include "nnacl/fp32/transpose_fp32.h" -#include "nnacl/int8/transpose_int8.h" -#include "nnacl/errorcode.h" - -namespace mindspore { -namespace kernel { -void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); - auto tmp = AnfAlgo::GetNodeAttr>(kernel_node, "perm"); - axes_ = {tmp.begin(), tmp.end()}; - dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); - if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) { - MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got " - << axes_.size() << "D."; - } - - for (size_t i = 0; i < axes_.size(); ++i) { - transpose_param_.perm_[i] = SizeToInt(axes_[i]); - } - int num_axes = SizeToInt(input_shape_.size()); - transpose_param_.perm_size_ = axes_.size(); - transpose_param_.num_axes_ = num_axes; - transpose_param_.strides_[num_axes - 1] = 1; - transpose_param_.out_strides_[num_axes - 1] = 1; - for (int i = num_axes - 2; i >= 0; i--) { - transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1]; - transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1]; - } - launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel; - launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel; - launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel; - launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel; - launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel; - launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel; - launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel; - launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel; - launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel; - launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel; - - auto iter = launch_map_.find(dtype_); - if (iter != launch_map_.end()) { - launch_func_ = iter->second; - } else { - MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU."; - } -} - -bool TransposeCPUFwdKernel::Launch(const std::vector &inputs, - const std::vector &, - const std::vector &outputs) { - launch_func_(this, inputs, outputs); - return true; -} - -template -void TransposeCPUFwdKernel::LaunchKernel(const std::vector &inputs, - const std::vector &outputs) { - const auto *input_addr = reinterpret_cast(inputs[0]->addr); - auto *output_addr = reinterpret_cast(outputs[0]->addr); - transpose_param_.data_num_ = inputs[0]->size / sizeof(T); - int output_shape[SizeToInt(output_shape_.size())]; - for (size_t i = 0; i < output_shape_.size(); ++i) { - output_shape[i] = SizeToInt(output_shape_[i]); - } - size_t data_count = (inputs[0]->size) / sizeof(T); - if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) { - int res = NNACL_ERR; - if constexpr (std::is_same_v) { - res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); - } else if constexpr (std::is_same_v) { - res = DoTransposeInt16(input_addr, output_addr, output_shape, 
&transpose_param_); - } else if constexpr (std::is_same_v) { - res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_); - } else if constexpr (std::is_same_v) { - res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_); - } else if constexpr (std::is_same_v) { - res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_); - } else if constexpr (std::is_same_v) { - res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_); - } else if constexpr (std::is_same_v) { - res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_); - } else if constexpr (std::is_same_v) { - res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_); - } else if constexpr (std::is_same_v) { - res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_); - } else if constexpr (std::is_same_v) { - res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_); - } - if (res != NNACL_OK) { - MS_LOG(ERROR) << "Transpose run failed"; - } - } else { - ParallelRun(input_addr, output_addr, output_shape, data_count); - } -} - -template -void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) { - auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum(); - const float block_size = 128.0; - size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num; - std::vector tasks; - std::function TransposeDims; - - if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsInt8; - } else if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsInt16; - } else if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsInt32; - } else if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsInt64; - } else if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsUInt8; - } else if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsUInt16; - } else if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsUInt32; - } else if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsUInt64; - } else if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsFp32; - } else if constexpr (std::is_same_v) { - TransposeDims = &TransposeDimsBool; - } - for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) { - auto task = [&, task_id, thread_num]() { - TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num)); - return common::SUCCESS; - }; - tasks.emplace_back(task); - } - common::ThreadPool::GetInstance().SyncRun(tasks); -} -} // namespace kernel -} // namespace mindspore +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h" +#include +#include +#include "runtime/device/cpu/cpu_device_address.h" +#include "common/thread_pool.h" +#include "nnacl/fp32/transpose_fp32.h" +#include "nnacl/int8/transpose_int8.h" +#include "nnacl/errorcode.h" + +namespace mindspore { +namespace kernel { +void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); + auto tmp = AnfAlgo::GetNodeAttr>(kernel_node, "perm"); + axes_ = {tmp.begin(), tmp.end()}; + dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); + if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) { + MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got " + << axes_.size() << "D."; + } + + for (size_t i = 0; i < axes_.size(); ++i) { + transpose_param_.perm_[i] = SizeToInt(axes_[i]); + } + int num_axes = SizeToInt(input_shape_.size()); + transpose_param_.perm_size_ = axes_.size(); + transpose_param_.num_axes_ = num_axes; + transpose_param_.strides_[num_axes - 1] = 1; + transpose_param_.out_strides_[num_axes - 1] = 1; + for (int i = num_axes - 2; i >= 0; i--) { + transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1]; + transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1]; + } + launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel; + launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel; + launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel; + launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel; + launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel; + launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel; + launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel; + launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel; + launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel; + launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel; + + auto iter = launch_map_.find(dtype_); + if (iter != launch_map_.end()) { + launch_func_ = iter->second; + } else { + MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU."; + } +} + +bool TransposeCPUFwdKernel::Launch(const std::vector &inputs, + const std::vector &, + const std::vector &outputs) { + launch_func_(this, inputs, outputs); + return true; +} + +template +void TransposeCPUFwdKernel::LaunchKernel(const std::vector &inputs, + const std::vector &outputs) { + const auto *input_addr = reinterpret_cast(inputs[0]->addr); + auto *output_addr = reinterpret_cast(outputs[0]->addr); + transpose_param_.data_num_ = inputs[0]->size / sizeof(T); + int output_shape[SizeToInt(output_shape_.size())]; + for (size_t i = 0; i < output_shape_.size(); ++i) { + output_shape[i] = SizeToInt(output_shape_[i]); + } + size_t data_count = (inputs[0]->size) / sizeof(T); + if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) { + int res = NNACL_ERR; + if constexpr (std::is_same_v) { + res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); + } else if constexpr (std::is_same_v) { + res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_); + } else if constexpr (std::is_same_v) { + res = DoTransposeInt32(input_addr, output_addr, 
output_shape, &transpose_param_);
+    } else if constexpr (std::is_same_v<T, int64_t>) {
+      res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_);
+    } else if constexpr (std::is_same_v<T, uint8_t>) {
+      res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_);
+    } else if constexpr (std::is_same_v<T, uint16_t>) {
+      res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_);
+    } else if constexpr (std::is_same_v<T, uint32_t>) {
+      res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_);
+    } else if constexpr (std::is_same_v<T, uint64_t>) {
+      res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_);
+    } else if constexpr (std::is_same_v<T, float>) {
+      res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_);
+    } else if constexpr (std::is_same_v<T, bool>) {
+      res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_);
+    }
+    if (res != NNACL_OK) {
+      MS_LOG(ERROR) << "Transpose run failed";
+    }
+  } else {
+    ParallelRun(input_addr, output_addr, output_shape, data_count);
+  }
+}
+
+template <typename T>
+void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
+  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
+  const float block_size = 128.0;
+  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
+  std::vector<common::Task> tasks;
+  std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
+
+  if constexpr (std::is_same_v<T, int8_t>) {
+    TransposeDims = &TransposeDimsInt8;
+  } else if constexpr (std::is_same_v<T, int16_t>) {
+    TransposeDims = &TransposeDimsInt16;
+  } else if constexpr (std::is_same_v<T, int>) {
+    TransposeDims = &TransposeDimsInt32;
+  } else if constexpr (std::is_same_v<T, int64_t>) {
+    TransposeDims = &TransposeDimsInt64;
+  } else if constexpr (std::is_same_v<T, uint8_t>) {
+    TransposeDims = &TransposeDimsUInt8;
+  } else if constexpr (std::is_same_v<T, uint16_t>) {
+    TransposeDims = &TransposeDimsUInt16;
+  } else if constexpr (std::is_same_v<T, uint32_t>) {
+    TransposeDims = &TransposeDimsUInt32;
+  } else if constexpr (std::is_same_v<T, uint64_t>) {
+    TransposeDims = &TransposeDimsUInt64;
+  } else if constexpr (std::is_same_v<T, float>) {
+    TransposeDims = &TransposeDimsFp32;
+  } else if constexpr (std::is_same_v<T, bool>) {
+    TransposeDims = &TransposeDimsBool;
+  }
+  for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
+    auto task = [&, task_id, thread_num]() {
+      TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
+      return common::SUCCESS;
+    };
+    tasks.emplace_back(task);
+  }
+  common::ThreadPool::GetInstance().SyncRun(tasks);
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h
index b5413865549..7dc7f9a1265 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h
@@ -1,58 +1,58 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
-#include <functional>
-#include <memory>
-#include <unordered_map>
-#include <vector>
-#include "backend/kernel_compiler/cpu/cpu_kernel.h"
-#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
-#include "nnacl/base/transpose_base.h"
-
-namespace mindspore {
-namespace kernel {
-class TransposeCPUFwdKernel : public CPUKernel {
- public:
-  TransposeCPUFwdKernel() = default;
-  ~TransposeCPUFwdKernel() override = default;
-
-  void InitKernel(const CNodePtr &kernel_node) override;
-
-  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
-              const std::vector<AddressPtr> &outputs) override;
-
- private:
-  template <typename T>
-  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
-
-  template <typename T>
-  void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);
-
-  TransposeParameter transpose_param_;
-  std::vector<size_t> input_shape_;
-  std::vector<size_t> output_shape_;
-  std::vector<size_t> axes_;
-  TypeId dtype_{kTypeUnknown};
-  using TypeKernel =
-    std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
-  std::unordered_map<TypeId, TypeKernel> launch_map_;
-  TypeKernel launch_func_;
-};
-MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
-}  // namespace kernel
-}  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "backend/kernel_compiler/cpu/cpu_kernel.h"
+#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
+#include "nnacl/base/transpose_base.h"
+
+namespace mindspore {
+namespace kernel {
+class TransposeCPUFwdKernel : public CPUKernel {
+ public:
+  TransposeCPUFwdKernel() = default;
+  ~TransposeCPUFwdKernel() override = default;
+
+  void InitKernel(const CNodePtr &kernel_node) override;
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs) override;
+
+ private:
+  template <typename T>
+  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
+
+  template <typename T>
+  void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);
+
+  TransposeParameter transpose_param_;
+  std::vector<size_t> input_shape_;
+  std::vector<size_t> output_shape_;
+  std::vector<size_t> axes_;
+  TypeId dtype_{kTypeUnknown};
+  using TypeKernel =
+    std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
+  std::unordered_map<TypeId, TypeKernel> launch_map_;
+  TypeKernel launch_func_;
+};
+MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
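
Reviewer note, not part of the diff: the stride bookkeeping that InitKernel() stores in
transpose_param_ (strides_/out_strides_ built innermost-out, perm_ mapping output axes
back to input axes) is what both the serial DoTranspose* paths and the TransposeDims*
parallel paths consume. Below is a minimal standalone sketch of that mapping, assuming
row-major layout; the names in_strides/out_strides/perm are illustrative and not taken
from the kernel.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Transpose a 2x3 tensor with perm = {1, 0}; input is laid out row-major.
  std::vector<int> input = {1, 2, 3, 4, 5, 6};
  std::vector<size_t> in_shape = {2, 3};
  std::vector<size_t> perm = {1, 0};
  std::vector<size_t> out_shape = {in_shape[perm[0]], in_shape[perm[1]]};

  // Same recurrence as InitKernel(): the innermost stride is 1, and each
  // outer stride is the product of all inner dimensions.
  std::vector<size_t> in_strides(in_shape.size(), 1);
  std::vector<size_t> out_strides(out_shape.size(), 1);
  for (int i = static_cast<int>(in_shape.size()) - 2; i >= 0; --i) {
    in_strides[i] = in_shape[i + 1] * in_strides[i + 1];
    out_strides[i] = out_shape[i + 1] * out_strides[i + 1];
  }

  // For each output element, decompose its linear offset into output
  // coordinates, then route every coordinate through perm back to the input.
  std::vector<int> output(input.size());
  for (size_t pos = 0; pos < output.size(); ++pos) {
    size_t rem = pos;
    size_t in_offset = 0;
    for (size_t axis = 0; axis < out_shape.size(); ++axis) {
      size_t coord = rem / out_strides[axis];
      rem %= out_strides[axis];
      in_offset += coord * in_strides[perm[axis]];
    }
    output[pos] = input[in_offset];
  }

  for (int v : output) {
    std::cout << v << ' ';  // prints: 1 4 2 5 3 6
  }
  std::cout << '\n';
  return 0;
}

The same index arithmetic, split by task_id over the output range, is what the
TransposeDims* functions parallelize across the thread pool in ParallelRun().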