Change line endings from CRLF to LF

This commit is contained in:
zhujingxuan 2021-07-13 16:50:33 +08:00
parent 85e20508eb
commit b3d4399d32
27 changed files with 3102 additions and 3102 deletions

View File

@ -1,116 +1,116 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h"
namespace mindspore {
namespace kernel {
constexpr size_t kBceInputNumWithWeight = 3;
template <typename T>
// Collapses the per-element losses in tmp_loss into a single scalar at loss[0].
// input_size: element count of tmp_loss. reduction: 1 == "mean" (divide by the
// element count); any other non-zero value (2 == "sum") keeps the raw sum.
// Uses a pairwise (tree) reduction: halving strides bounds float rounding error
// better than a naive sequential sum, which matters for float16.
void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) {
  // Fold the trailing odd element in before the first halving pass.
  if (input_size % 2 == 1) {
    tmp_loss[0] += tmp_loss[input_size - 1];
  }
  for (int stride = input_size / 2; stride > 0; stride = stride / 2) {
    // Pairwise add: element i accumulates its partner at i + stride.
    for (int i = 0; i < stride; i++) {
      tmp_loss[i] += tmp_loss[i + stride];
    }
    // An odd stride leaves one element unpaired on the next pass; fold it in now.
    if (stride > 2 && stride % 2 == 1) {
      tmp_loss[0] += tmp_loss[stride - 1];
    }
  }
  loss[0] += tmp_loss[0];
  if (reduction == 1) {  // 1 == "mean": average over all elements.
    loss[0] /= static_cast<T>(input_size);
  }
}
template <typename T>
// Element-wise binary cross entropy:
//   loss_i = -w_i * (y_i * log(x_i + eps) + (1 - y_i) * log(1 - x_i + eps))
// with eps = 1e-12 guarding log(0) and w_i the optional weight (inputs[2]).
// reduction_ == 0 ("none") writes per-element losses straight to the output;
// otherwise losses are staged in tmp_loss and collapsed by LaunchToScalar.
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                               const std::vector<AddressPtr> &workspace,
                                               const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);  // predictions x
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);  // targets y
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[2]->addr);  // optional per-element weight
  }
  T *loss = reinterpret_cast<T *>(outputs[0]->addr);
  std::vector<T> tmp_loss(input_size_);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  // Four near-identical loops covering {none, reduced} x {weighted, unweighted}.
  if (reduction_ == 0 && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if (reduction_ == 0 && (!weight_defined_)) {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if ((reduction_ != 0) && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  } else {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  }
  if (reduction_ != 0) {
    LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data());
  }
}
// Dispatches the typed kernel implementation based on the input dtype.
// Zero-element inputs are a no-op; unsupported dtypes fall through silently
// (matching the original behavior).
bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
                                         const std::vector<AddressPtr> &workspace,
                                         const std::vector<AddressPtr> &outputs) {
  if (input_size_ == 0) {
    return true;
  }
  switch (dtype_) {
    case kNumberTypeFloat32:
      Launchkernel<float>(inputs, workspace, outputs);
      break;
    case kNumberTypeFloat16:
      Launchkernel<float16>(inputs, workspace, outputs);
      break;
    default:
      break;
  }
  return true;
}
// Parses node attributes ahead of Launch: flattened element count, reduction
// mode, presence of the optional weight input, and the input dtype.
void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  // input_size_ starts at 1 (constructor), so this is the product of all dims.
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  // Encode reduction as an int: 0 == "none", 2 == "sum"; any other value
  // (e.g. "mean") keeps the constructor default of 1.
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  // A third input means an explicit per-element weight tensor was supplied.
  weight_defined_ = (input_num == kBceInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h"
namespace mindspore {
namespace kernel {
constexpr size_t kBceInputNumWithWeight = 3;
template <typename T>
// Collapses the per-element losses in tmp_loss into a single scalar at loss[0].
// input_size: element count of tmp_loss. reduction: 1 == "mean" (divide by the
// element count); any other non-zero value (2 == "sum") keeps the raw sum.
// Uses a pairwise (tree) reduction: halving strides bounds float rounding error
// better than a naive sequential sum, which matters for float16.
void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) {
  // Fold the trailing odd element in before the first halving pass.
  if (input_size % 2 == 1) {
    tmp_loss[0] += tmp_loss[input_size - 1];
  }
  for (int stride = input_size / 2; stride > 0; stride = stride / 2) {
    // Pairwise add: element i accumulates its partner at i + stride.
    for (int i = 0; i < stride; i++) {
      tmp_loss[i] += tmp_loss[i + stride];
    }
    // An odd stride leaves one element unpaired on the next pass; fold it in now.
    if (stride > 2 && stride % 2 == 1) {
      tmp_loss[0] += tmp_loss[stride - 1];
    }
  }
  loss[0] += tmp_loss[0];
  if (reduction == 1) {  // 1 == "mean": average over all elements.
    loss[0] /= static_cast<T>(input_size);
  }
}
template <typename T>
// Element-wise binary cross entropy:
//   loss_i = -w_i * (y_i * log(x_i + eps) + (1 - y_i) * log(1 - x_i + eps))
// with eps = 1e-12 guarding log(0) and w_i the optional weight (inputs[2]).
// reduction_ == 0 ("none") writes per-element losses straight to the output;
// otherwise losses are staged in tmp_loss and collapsed by LaunchToScalar.
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                               const std::vector<AddressPtr> &workspace,
                                               const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);  // predictions x
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);  // targets y
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[2]->addr);  // optional per-element weight
  }
  T *loss = reinterpret_cast<T *>(outputs[0]->addr);
  std::vector<T> tmp_loss(input_size_);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  // Four near-identical loops covering {none, reduced} x {weighted, unweighted}.
  if (reduction_ == 0 && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if (reduction_ == 0 && (!weight_defined_)) {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if ((reduction_ != 0) && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  } else {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  }
  if (reduction_ != 0) {
    LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data());
  }
}
// Dispatches the typed kernel implementation based on the input dtype.
// Zero-element inputs are a no-op; unsupported dtypes fall through silently
// (matching the original behavior).
bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
                                         const std::vector<AddressPtr> &workspace,
                                         const std::vector<AddressPtr> &outputs) {
  if (input_size_ == 0) {
    return true;
  }
  switch (dtype_) {
    case kNumberTypeFloat32:
      Launchkernel<float>(inputs, workspace, outputs);
      break;
    case kNumberTypeFloat16:
      Launchkernel<float16>(inputs, workspace, outputs);
      break;
    default:
      break;
  }
  return true;
}
// Parses node attributes ahead of Launch: flattened element count, reduction
// mode, presence of the optional weight input, and the input dtype.
void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  // input_size_ starts at 1 (constructor), so this is the product of all dims.
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  // Encode reduction as an int: 0 == "none", 2 == "sum"; any other value
  // (e.g. "mean") keeps the constructor default of 1.
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  // A third input means an explicit per-element weight tensor was supplied.
  weight_defined_ = (input_num == kBceInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,71 +1,71 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
#include <vector>
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the BinaryCrossEntropy forward op. Per-element loss is
//   -w * (y * log(x) + (1 - y) * log(1 - x))
// with an optional weight input w; the "reduction" attribute selects
// none / mean / sum output.
class BinaryCrossEntropyCpuKernel : public CPUKernel {
 public:
  // input_size_ starts at 1 so InitKernel can multiply the shape into it;
  // reduction_ defaults to 1 ("mean"); the weight input is assumed absent
  // until InitKernel sees a third input.
  BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyCpuKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Collapses per-element losses to a scalar in loss[0] (mean or sum).
  template <typename T>
  void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
  // Typed implementation dispatched from Launch.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                    const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};  // input dtype captured in InitKernel
  size_t input_size_;           // flattened element count of the input
  int reduction_;               // 0: none, 1: mean, 2: sum
  bool weight_defined_;  // true: there are 3 inputs, false: there are 2 inputs(no [weight])
};
// Registrations with the optional weight input: (x, y, weight) -> loss.
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyCpuKernel);
// Registrations without weight: (x, y) -> loss.
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
#include <vector>
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the BinaryCrossEntropy forward op. Per-element loss is
//   -w * (y * log(x) + (1 - y) * log(1 - x))
// with an optional weight input w; the "reduction" attribute selects
// none / mean / sum output.
class BinaryCrossEntropyCpuKernel : public CPUKernel {
 public:
  // input_size_ starts at 1 so InitKernel can multiply the shape into it;
  // reduction_ defaults to 1 ("mean"); the weight input is assumed absent
  // until InitKernel sees a third input.
  BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyCpuKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Collapses per-element losses to a scalar in loss[0] (mean or sum).
  template <typename T>
  void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
  // Typed implementation dispatched from Launch.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                    const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};  // input dtype captured in InitKernel
  size_t input_size_;           // flattened element count of the input
  int reduction_;               // 0: none, 1: mean, 2: sum
  bool weight_defined_;  // true: there are 3 inputs, false: there are 2 inputs(no [weight])
};
// Registrations with the optional weight input: (x, y, weight) -> loss.
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyCpuKernel);
// Registrations without weight: (x, y) -> loss.
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H

View File

@ -1,102 +1,102 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h"
namespace mindspore {
namespace kernel {
constexpr size_t kBceGradInputNumWithWeight = 4;
template <typename T>
// Gradient of binary cross entropy w.r.t. the prediction x:
//   dx_i = w_i * (x_i - y_i) / max(x_i * (1 - x_i), eps) * dloss
// The denominator is clamped to eps = 1e-12 so x at exactly 0 or 1 does not
// divide by zero. reduction_: 0 == "none" (per-element dloss), 1 == "mean"
// (scalar dloss spread over the element count), otherwise "sum" (scalar dloss).
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                                   const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);  // predictions x
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);  // targets y
  T *dloss = reinterpret_cast<T *>(inputs[2]->addr);    // upstream gradient
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[3]->addr);  // optional per-element weight
  }
  T *dx = reinterpret_cast<T *>(outputs[0]->addr);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  if (reduction_ == 0) {
    // "none": dloss holds one upstream gradient per element.
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    }
  } else {
    // "mean"/"sum": dloss is a scalar; "mean" divides it across all elements.
    T dloss1 = dloss[0];
    if (reduction_ == 1) {
      dloss1 = dloss[0] / static_cast<T>(input_size_);
    }
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    }
  }
}
// Dispatches the typed implementation based on the input dtype. Zero-element
// inputs are a no-op; unsupported dtypes fall through silently (matching the
// original behavior).
bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
                                             const std::vector<AddressPtr> &workspace,
                                             const std::vector<AddressPtr> &outputs) {
  if (input_size_ == 0) {
    return true;
  }
  switch (dtype_) {
    case kNumberTypeFloat32:
      Launchkernel<float>(inputs, outputs);
      break;
    case kNumberTypeFloat16:
      Launchkernel<float16>(inputs, outputs);
      break;
    default:
      break;
  }
  return true;
}
// Parses node attributes ahead of Launch: flattened element count, reduction
// mode, presence of the optional weight input, and the input dtype.
void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  // input_size_ starts at 1 (constructor), so this is the product of all dims.
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  // 0 == "none", 2 == "sum"; anything else keeps the default 1 ("mean").
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  // A fourth input means an explicit per-element weight tensor was supplied.
  weight_defined_ = (input_num == kBceGradInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h"
namespace mindspore {
namespace kernel {
constexpr size_t kBceGradInputNumWithWeight = 4;
template <typename T>
// Gradient of binary cross entropy w.r.t. the prediction x:
//   dx_i = w_i * (x_i - y_i) / max(x_i * (1 - x_i), eps) * dloss
// The denominator is clamped to eps = 1e-12 so x at exactly 0 or 1 does not
// divide by zero. reduction_: 0 == "none" (per-element dloss), 1 == "mean"
// (scalar dloss spread over the element count), otherwise "sum" (scalar dloss).
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                                   const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);  // predictions x
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);  // targets y
  T *dloss = reinterpret_cast<T *>(inputs[2]->addr);    // upstream gradient
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[3]->addr);  // optional per-element weight
  }
  T *dx = reinterpret_cast<T *>(outputs[0]->addr);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  if (reduction_ == 0) {
    // "none": dloss holds one upstream gradient per element.
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    }
  } else {
    // "mean"/"sum": dloss is a scalar; "mean" divides it across all elements.
    T dloss1 = dloss[0];
    if (reduction_ == 1) {
      dloss1 = dloss[0] / static_cast<T>(input_size_);
    }
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    }
  }
}
// Dispatches the typed implementation based on the input dtype. Zero-element
// inputs are a no-op; unsupported dtypes fall through silently (matching the
// original behavior).
bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
                                             const std::vector<AddressPtr> &workspace,
                                             const std::vector<AddressPtr> &outputs) {
  if (input_size_ == 0) {
    return true;
  }
  switch (dtype_) {
    case kNumberTypeFloat32:
      Launchkernel<float>(inputs, outputs);
      break;
    case kNumberTypeFloat16:
      Launchkernel<float16>(inputs, outputs);
      break;
    default:
      break;
  }
  return true;
}
// Parses node attributes ahead of Launch: flattened element count, reduction
// mode, presence of the optional weight input, and the input dtype.
void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  // input_size_ starts at 1 (constructor), so this is the product of all dims.
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  // 0 == "none", 2 == "sum"; anything else keeps the default 1 ("mean").
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  // A fourth input means an explicit per-element weight tensor was supplied.
  weight_defined_ = (input_num == kBceGradInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,76 +1,76 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
#include <vector>
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the BinaryCrossEntropy backward op: computes
// dx = w * (x - y) / (x * (1 - x)), scaled by the incoming gradient dloss.
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
 public:
  // input_size_ starts at 1 so InitKernel can multiply the shape into it;
  // reduction_ defaults to 1 ("mean"); the weight input is assumed absent
  // until InitKernel sees a fourth input.
  BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyGradCpuKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Typed implementation dispatched from Launch.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};  // input dtype captured in InitKernel
  size_t input_size_;           // flattened element count of the input
  int reduction_;               // 0: none, 1: mean, 2: sum
  bool weight_defined_;  // true: there are 4 inputs, false: there are 3 inputs(no [weight])
};
// Registrations with the optional weight input: (x, y, dloss, weight) -> dx.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
// Registrations without weight: (x, y, dloss) -> dx.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
#include <vector>
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the BinaryCrossEntropy backward op: computes
// dx = w * (x - y) / (x * (1 - x)), scaled by the incoming gradient dloss.
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
 public:
  // input_size_ starts at 1 so InitKernel can multiply the shape into it;
  // reduction_ defaults to 1 ("mean"); the weight input is assumed absent
  // until InitKernel sees a fourth input.
  BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyGradCpuKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Typed implementation dispatched from Launch.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};  // input dtype captured in InitKernel
  size_t input_size_;           // flattened element count of the input
  int reduction_;               // 0: none, 1: mean, 2: sum
  bool weight_defined_;  // true: there are 4 inputs, false: there are 3 inputs(no [weight])
};
// Registrations with the optional weight input: (x, y, dloss, weight) -> dx.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
// Registrations without weight: (x, y, dloss) -> dx.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H

View File

@ -1,271 +1,271 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include <algorithm>
#include <utility>
#include "common/thread_pool.h"
namespace mindspore {
namespace kernel {
// Pre-computes the byte size of every input and output tensor of kernel_node,
// filling input_size_list_ / output_size_list_ for the framework's allocator.
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
    // Bytes = product(shape) * type_size; an empty shape denotes a scalar.
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    // Ensure at least one element's worth of bytes even if a dimension is 0.
    tensor_size = std::max(tensor_size, type_size);
    input_size_list_.emplace_back(tensor_size);
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    output_size_list_.emplace_back(tensor_size);
  }
}
// Template-method entry point: concrete kernels implement InitKernel(); the
// input/output size bookkeeping is common to all CPU kernels.
void CPUKernel::Init(const CNodePtr &kernel_node) {
  InitKernel(kernel_node);
  InitInputOutputSize(kernel_node);
}
// Left-pads *shape with leading 1s until it has at least four dimensions.
// Shapes that already have four or more dimensions are left untouched.
void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
  const size_t rank = shape->size();
  if (rank >= 4) {
    return;
  }
  shape->insert(shape->begin(), 4 - rank, static_cast<size_t>(1));
}
// Maps a 4-D coordinate (dim0, dim1, dim2, dim3) to its flat row-major offset
// within a tensor of the given shape (expects shape to hold 4 extents).
// Horner form of dim0*s1*s2*s3 + dim1*s2*s3 + dim2*s3 + dim3 — identical
// value, fewer multiplications.
size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2,
                                  size_t dim3) {
  return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3;
}
// Returns the number of elements spanned by one step along `axis` in a 4-D
// shape, i.e. the product of the extents after `axis` up to index 3.
// A negative axis counts from the back, as in Python.
size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
  const int dim = axis < 0 ? axis + SizeToInt(shape.size()) : axis;
  size_t stride = 1;
  for (int idx = dim + 1; idx <= 3; ++idx) {
    stride *= shape[idx];
  }
  return stride;
}
// Appends to *element_num, for every dimension of `shape`, the number of
// elements one index step covers (the row-major stride), outermost first; the
// innermost dimension always has stride 1.
void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
  size_t accumulation = 1;
  element_num->emplace_back(1);
  // Walk from the last dimension down to (and excluding) dimension 0. Using
  // `i > 1` with `shape[i - 1]` avoids the size_t underflow the previous
  // `shape.size() - 1` start value produced for an empty shape, which read
  // shape[SIZE_MAX] out of bounds.
  for (size_t i = shape.size(); i > 1; --i) {
    accumulation *= shape[i - 1];
    element_num->emplace_back(accumulation);
  }
  std::reverse(element_num->begin(), element_num->end());
}
// Splits [0, count) into contiguous chunks and runs `task(start, end)` on each
// through the shared thread pool, blocking until all chunks finish. Small
// workloads use fewer threads (about one per 128 elements).
void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) {
  if (count == 0) {
    // Nothing to do. This also prevents thread_num from becoming 0 below,
    // which previously divided by zero when computing the chunk size.
    return;
  }
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  if (thread_num == 0) {
    thread_num = 1;  // guard against a pool that reports zero threads
  }
  std::vector<common::Task> tasks;
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    // Chunk bounds are captured by value; `task` by reference is safe because
    // SyncRun blocks until every task has completed.
    auto block = [&, start, end]() {
      task(start, end);
      return common::SUCCESS;
    };
    tasks.emplace_back(block);
    start += once_compute_size;
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
// Collapses `shape` into a 2-D {rows, cols} shape split at `axis`:
// rows = product of the dimensions before axis, cols = product of the rest.
// A negative axis counts from the back, as in Python.
std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
  const int split = axis < 0 ? axis + SizeToInt(shape.size()) : axis;
  size_t rows = 1;
  size_t cols = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    // Dimensions left of the split multiply into rows, the rest into cols.
    (SizeToInt(i) < split ? rows : cols) *= shape[i];
  }
  return {rows, cols};
}
// Iterates two (possibly broadcast) inputs in lock-step with their broadcast
// output shape: input_pos_[0] / input_pos_[1] track the current flat offsets
// into input a and input b, advanced one output element at a time by
// SetPos / GenNextPos.
BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();  // pad both inputs to the output rank first
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}
// Positions the iterator at output element `pos`: decomposes pos into
// per-dimension coordinates and accumulates the matching input offsets
// (broadcast dimensions contribute stride 0, see InitStrides).
// NOTE(review): the += accumulation presumes input_pos_ starts at zero —
// confirm against the member's initializer.
void BroadcastIterator::SetPos(size_t pos) {
  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % output_shape_[i];
    input_pos_[0] += coordinates_[i] * input_strides_a_[i];
    input_pos_[1] += coordinates_[i] * input_strides_b_[i];
    pos /= output_shape_[i];
  }
}
// Odometer-style advance to the next output element: a maxed-out dimension
// rolls over to 0 (undone via its back stride), otherwise the dimension is
// bumped by one stride and iteration stops.
void BroadcastIterator::GenNextPos() {
  // Calculate output next coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      // Rolling over subtracts (extent - 1) * stride, back to this dim's start.
      input_pos_[0] -= input_back_strides_a_[i];
      input_pos_[1] -= input_back_strides_b_[i];
    } else {
      ++coordinates_[i];
      input_pos_[0] += input_strides_a_[i];
      input_pos_[1] += input_strides_b_[i];
      break;
    }
  }
}
void BroadcastIterator::BroadcastShape() {
int input_dimension_a = input_shape_a_.size();
if (input_dimension_a < output_dimension_) {
input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
}
int input_dimension_b = input_shape_b_.size();
if (input_dimension_b < output_dimension_) {
input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
}
}
// Computes row-major strides and back-strides for both padded input shapes,
// then zeroes the stride of every axis whose extent is 1 so broadcast axes
// never advance that input's offset.
void BroadcastIterator::InitStrides() {
  input_strides_a_[output_dimension_ - 1] = 1;
  input_strides_b_[output_dimension_ - 1] = 1;
  for (int i = output_dimension_ - 2; i >= 0; --i) {
    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
    // Back stride: distance needed to undo a full sweep of dimension i + 1.
    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
  }
  // Update strides for broadcast
  // While the axis value is 1, the stride is 0
  std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
  std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
}
// Build an iterator that walks the transposed tensor in row-major order while
// exposing (via GetPos) the flat index into the original input buffer.
// output_shape: shape after applying `axes`; axes: permutation of the input
// axes; input_shape: original (untransposed) shape.
TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Row-major strides of the untransposed input.
  dimension_ = shape_.size();
  std::vector<uint32_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }
  // Permute the input strides by `axes` and precompute the back strides
  // (the offset removed when an axis wraps back to coordinate 0).
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }
  // Coordinate scratch space used by SetPos/GenNextPos.
  coordinates_.resize(dimension_);
}
// Position the iterator at flat transposed index `pos`: decompose into
// coordinates (last axis fastest) and accumulate the input offset pos_.
void TransposeIterator::SetPos(size_t pos) {
  for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % shape_[i];
    pos_ += coordinates_[i] * strides_[i];
    pos /= shape_[i];
  }
}
// Advance to the next transposed element: ripple-carry increment of the
// coordinate from the fastest axis, updating pos_ with strides/back strides.
void TransposeIterator::GenNextPos() {
  for (int i = dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == shape_[i]) {
      // Axis wraps: return to coordinate 0 and carry to the next-slower axis.
      coordinates_[i] = 0;
      pos_ -= back_strides_[i];
    } else {
      coordinates_[i]++;
      pos_ += strides_[i];
      break;
    }
  }
}
// Compute the numpy-style broadcast of shapes `x` and `y`, aligned at the
// trailing dimensions: a dimension of 1 broadcasts to the other input's
// dimension, equal dimensions are kept, and the leading dimensions of the
// longer shape are copied through unchanged.
// NOTE: incompatible axes (unequal and neither is 1) are skipped, preserving
// the historical behavior; the final copy loop now iterates the actual size,
// which previously read out of bounds in that case. The old loop also did
// `int i = -length` with a size_t `length` (implementation-defined narrowing);
// an unsigned index avoids that.
std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) {
  size_t x_len = x.size();
  size_t y_len = y.size();
  size_t length = x_len < y_len ? x_len : y_len;
  std::vector<size_t> broadcast_shape;
  std::vector<size_t> broadcast_shape_back;
  // Walk the `length` trailing axes left-to-right.
  for (size_t i = 0; i < length; ++i) {
    const size_t dim_x = x[x_len - length + i];
    const size_t dim_y = y[y_len - length + i];
    if (dim_x == 1) {
      broadcast_shape_back.push_back(dim_y);
    } else if (dim_y == 1 || dim_x == dim_y) {
      broadcast_shape_back.push_back(dim_x);
    }
  }
  // Copy the leading (non-overlapping) dimensions of the longer input.
  if (length == x_len) {
    for (size_t i = 0; i < y_len - length; ++i) {
      broadcast_shape.push_back(y[i]);
    }
  } else {
    for (size_t i = 0; i < x_len - length; ++i) {
      broadcast_shape.push_back(x[i]);
    }
  }
  // Append the broadcast trailing dimensions computed above.
  for (size_t i = 0; i < broadcast_shape_back.size(); ++i) {
    broadcast_shape.push_back(broadcast_shape_back[i]);
  }
  return broadcast_shape;
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include <algorithm>
#include <utility>
#include "common/thread_pool.h"
namespace mindspore {
namespace kernel {
// Compute the byte size of every input and output tensor of `kernel_node` and
// record them in input_size_list_ / output_size_list_ for the framework's
// memory allocation.
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
    // An empty shape denotes a scalar: it still occupies one element.
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    input_size_list_.emplace_back(tensor_size);
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    output_size_list_.emplace_back(tensor_size);
  }
}
// Template method: kernel-specific parsing first (subclass InitKernel), then
// the generic input/output byte-size bookkeeping.
void CPUKernel::Init(const CNodePtr &kernel_node) {
  InitKernel(kernel_node);
  InitInputOutputSize(kernel_node);
}
// Left-pad `shape` with 1s until it is 4-D; shapes already >= 4-D are
// left untouched.
void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
  const size_t dims = shape->size();
  if (dims < 4) {
    // One count-fill insert at the front instead of inserting 1s one by one.
    shape->insert(shape->begin(), 4 - dims, 1);
  }
}
// Row-major flat offset into a 4-D tensor, computed in Horner form:
// ((d0 * S1 + d1) * S2 + d2) * S3 + d3.
size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2,
                                  size_t dim3) {
  return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3;
}
// Product of the dimensions strictly after `axis` (the element count of one
// slice along that axis). A negative axis counts from the end.
// NOTE(review): the loop starts at index 3, so this assumes a 4-D shape
// (see ExpandDimsTo4) — confirm callers expand the shape first.
size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t result = 1;
  for (int j = 3; j > axis; --j) {
    result *= shape[j];
  }
  return result;
}
// Fill `element_num` with the row-major stride (in elements) of every axis,
// e.g. shape [2, 3, 4] -> [12, 4, 1].
// The loop runs i = size .. 2 and indexes shape[i - 1]; unlike the previous
// `i = size - 1 .. 1` form, the unsigned index cannot underflow when `shape`
// is empty (an empty shape now safely yields just {1}).
void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
  size_t accumulation = 1;
  element_num->emplace_back(1);
  for (size_t i = shape.size(); i > 1; --i) {
    accumulation *= shape[i - 1];
    element_num->emplace_back(accumulation);
  }
  std::reverse(element_num->begin(), element_num->end());
}
// Run task(start, end) over [0, count) on the shared thread pool and block
// until every chunk has finished. Work is split into ~128-element chunks;
// small workloads use fewer threads than the pool maximum.
void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) {
  if (count == 0) {
    // Guard: the chunk-size computation below divides by thread_num, which
    // would be 0 for an empty range (previously a division by zero).
    return;
  }
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;  // ceil(count / thread_num)
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    // Chunk bounds are captured by value; capturing `task` by reference is
    // safe because SyncRun blocks until all tasks complete.
    auto block = [&, start, end]() {
      task(start, end);
      return common::SUCCESS;
    };
    tasks.emplace_back(block);
    start += once_compute_size;
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
// Collapse `shape` into a 2-D shape [prod(dims before axis), prod(dims from
// axis on)]. A negative axis counts from the end, as in numpy.
std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis += SizeToInt(shape.size());
  }
  size_t rows = 1;
  size_t cols = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    // Dimensions left of `axis` build the row count, the rest the column count.
    (SizeToInt(i) < axis ? rows : cols) *= shape[i];
  }
  return {rows, cols};
}
// Construct an iterator for a broadcast binary op. Shapes are taken by value
// and moved; the input shapes are padded to the output rank and the
// broadcast-aware strides are precomputed.
BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}
// Position the iterator at flat output index `pos`: decompose it into
// per-axis coordinates (last axis fastest) and accumulate the matching flat
// offsets of both inputs.
void BroadcastIterator::SetPos(size_t pos) {
  for (int axis = output_dimension_ - 1; axis >= 0 && pos != 0; --axis) {
    const size_t coord = pos % output_shape_[axis];
    coordinates_[axis] = coord;
    input_pos_[0] += coord * input_strides_a_[axis];
    input_pos_[1] += coord * input_strides_b_[axis];
    pos /= output_shape_[axis];
  }
}
// Advance to the next output element: ripple-carry increment of the output
// coordinate from the fastest axis, updating both input offsets incrementally.
void BroadcastIterator::GenNextPos() {
  for (int axis = output_dimension_ - 1; axis >= 0; --axis) {
    if (coordinates_[axis] + 1 != output_shape_[axis]) {
      // No carry: step forward along this axis and stop.
      ++coordinates_[axis];
      input_pos_[0] += input_strides_a_[axis];
      input_pos_[1] += input_strides_b_[axis];
      return;
    }
    // Carry: wrap this axis to 0 and continue with the next-slower axis.
    coordinates_[axis] = 0;
    input_pos_[0] -= input_back_strides_a_[axis];
    input_pos_[1] -= input_back_strides_b_[axis];
  }
}
void BroadcastIterator::BroadcastShape() {
int input_dimension_a = input_shape_a_.size();
if (input_dimension_a < output_dimension_) {
input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
}
int input_dimension_b = input_shape_b_.size();
if (input_dimension_b < output_dimension_) {
input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
}
}
// Precompute row-major strides and "back strides" for both inputs.
// back_stride[i] = (shape[i] - 1) * stride[i]: the offset removed when axis i
// wraps from its last coordinate back to 0. Axes of size 1 (broadcast axes)
// get stride 0 so the single input element is reused along that axis.
void BroadcastIterator::InitStrides() {
  input_strides_a_[output_dimension_ - 1] = 1;
  input_strides_b_[output_dimension_ - 1] = 1;
  for (int i = output_dimension_ - 2; i >= 0; --i) {
    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
  }
  // Update strides for broadcast
  // While the axis value is 1, the stride is 0
  std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
  std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
}
// Build an iterator that walks the transposed tensor in row-major order while
// exposing (via GetPos) the flat index into the original input buffer.
// output_shape: shape after applying `axes`; axes: permutation of the input
// axes; input_shape: original (untransposed) shape.
TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Row-major strides of the untransposed input.
  dimension_ = shape_.size();
  std::vector<uint32_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }
  // Permute the input strides by `axes` and precompute the back strides
  // (the offset removed when an axis wraps back to coordinate 0).
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }
  // Coordinate scratch space used by SetPos/GenNextPos.
  coordinates_.resize(dimension_);
}
// Position the iterator at flat transposed index `pos`: decompose it into
// coordinates (last axis fastest) and accumulate the input offset pos_.
void TransposeIterator::SetPos(size_t pos) {
  for (int axis = dimension_ - 1; axis >= 0 && pos != 0; --axis) {
    const size_t coord = pos % shape_[axis];
    coordinates_[axis] = coord;
    pos_ += coord * strides_[axis];
    pos /= shape_[axis];
  }
}
// Advance to the next transposed element: ripple-carry increment of the
// coordinate from the fastest axis, updating pos_ with strides/back strides.
void TransposeIterator::GenNextPos() {
  for (int axis = dimension_ - 1; axis >= 0; --axis) {
    if (coordinates_[axis] + 1 != shape_[axis]) {
      // No carry: step forward along this axis and stop.
      ++coordinates_[axis];
      pos_ += strides_[axis];
      return;
    }
    // Carry: wrap this axis to 0 and continue with the next-slower axis.
    coordinates_[axis] = 0;
    pos_ -= back_strides_[axis];
  }
}
// Compute the numpy-style broadcast of shapes `x` and `y`, aligned at the
// trailing dimensions: a dimension of 1 broadcasts to the other input's
// dimension, equal dimensions are kept, and the leading dimensions of the
// longer shape are copied through unchanged.
// NOTE: incompatible axes (unequal and neither is 1) are skipped, preserving
// the historical behavior; the final copy loop now iterates the actual size,
// which previously read out of bounds in that case. The old loop also did
// `int i = -length` with a size_t `length` (implementation-defined narrowing);
// an unsigned index avoids that.
std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) {
  size_t x_len = x.size();
  size_t y_len = y.size();
  size_t length = x_len < y_len ? x_len : y_len;
  std::vector<size_t> broadcast_shape;
  std::vector<size_t> broadcast_shape_back;
  // Walk the `length` trailing axes left-to-right.
  for (size_t i = 0; i < length; ++i) {
    const size_t dim_x = x[x_len - length + i];
    const size_t dim_y = y[y_len - length + i];
    if (dim_x == 1) {
      broadcast_shape_back.push_back(dim_y);
    } else if (dim_y == 1 || dim_x == dim_y) {
      broadcast_shape_back.push_back(dim_x);
    }
  }
  // Copy the leading (non-overlapping) dimensions of the longer input.
  if (length == x_len) {
    for (size_t i = 0; i < y_len - length; ++i) {
      broadcast_shape.push_back(y[i]);
    }
  } else {
    for (size_t i = 0; i < x_len - length; ++i) {
      broadcast_shape.push_back(x[i]);
    }
  }
  // Append the broadcast trailing dimensions computed above.
  for (size_t i = 0; i < broadcast_shape_back.size(); ++i) {
    broadcast_shape.push_back(broadcast_shape_back[i]);
  }
  return broadcast_shape;
}
} // namespace kernel
} // namespace mindspore

View File

@@ -1,205 +1,205 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
#include <functional>
#include <memory>
#include <numeric>
#include <string>
#include <thread>
#include <vector>
#include "backend/kernel_compiler/kernel.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/kernel_compiler/common_utils.h"
#include "ir/anf.h"
using mindspore::kernel::Address;
using mindspore::kernel::AddressPtr;
using CTask = std::function<void(size_t, size_t)>;
namespace mindspore {
namespace kernel {
// Attribute keys used by CPU kernels when querying node attributes
// (e.g. via AnfAlgo::GetNodeAttr) during InitKernel.
const char KERNEL_SIZE[] = "kernel_size";
const char STRIDE[] = "stride";
const char STRIDES[] = "strides";
const char DILATION[] = "dilation";
const char DILATIONS[] = "dilations";
const char FORMAT[] = "format";
const char PAD[] = "pad";
const char PAD_LIST[] = "pad_list";
const char PAD_MODE[] = "pad_mode";
const char PAD_MODE_LOWER_SAME[] = "same";
const char PAD_MODE_LOWER_VALID[] = "valid";
const char PAD_MODE_UPPER_SAME[] = "SAME";
const char PAD_MODE_UPPER_VALID[] = "VALID";
const char TRANSPOSE_A[] = "transpose_a";
const char TRANSPOSE_B[] = "transpose_b";
const char IS_GRAD[] = "is_grad";
// Single-character flags describing whether a matmul operand is transposed.
const char TRANSPOSE_NO = 'N';
const char TRANSPOSE_YES = 'T';
const char AXIS[] = "axis";
const char DIM[] = "dim";
const char BEGIN[] = "begin";
const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
const char START[] = "start";
const char LIMIT[] = "limit";
const char DELTA[] = "delta";
const char SORTED[] = "sorted";
const char ADJ_ST[] = "adjoint_st";
const char ADJ_dT[] = "adjoint_dt";
// Operation selector shared by the generic element-wise / arithmetic CPU
// kernels. NOTE(review): enumerator values are implicitly sequential from 0;
// confirm no external table depends on the numeric values before reordering.
enum OperateType {
  ADD = 0,
  SUB,
  MUL,
  DIV,
  SQUARE,
  SQRT,
  POW,
  REALDIV,
  FLOORDIV,
  MOD,
  FLOORMOD,
  NEG,
  LESS,
  ASSIGNADD,
  RELUGRAD,
  RELU6GRAD,
  ABSGRAD,
  TANHGRAD,
  SQRTGRAD,
  SIGMOIDGRAD,
  ONESLIKE,
  ZEROSLIKE,
  SIGN,
  EQUAL,
  NOTEQUAL,
  LESSEQUAL,
  LOGICALAND,
  LOGICALOR,
  LOGICALNOT,
  FLOOR,
  SQUAREDDIFFERENCE,
  GREATER,
  GREATEREQUAL,
  RECIPROCAL,
  GELU,
  GELUGRAD,
  ASIN,
  ACOS,
  ATAN,
  ASINGRAD,
  ACOSGRAD,
  ATANGRAD,
  SIN,
  COS,
  TAN,
  SINH,
  COSH,
  ASINH,
  ACOSH,
  ATANH,
  ASINHGRAD,
  ACOSHGRAD,
  ATAN2,
  RINT,
  ROUND,
  IDENTITY,
};
// Base class for CPU kernels. Subclasses implement InitKernel() (parse the
// graph node) and the 3-argument Launch() (execute on host buffers).
class CPUKernel : public kernel::KernelMod {
 public:
  CPUKernel() = default;
  ~CPUKernel() override = default;
  // Parses the node (InitKernel) and then fills the input/output size lists.
  virtual void Init(const CNodePtr &kernel_node);
  virtual void InitKernel(const CNodePtr &kernel_node) = 0;
  // KernelMod adapter; stream_ptr is unused on CPU.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override {
    return Launch(inputs, workspace, outputs);
  };
  virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                      const std::vector<AddressPtr> &outputs) = 0;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
 protected:
  virtual void InitInputOutputSize(const CNodePtr &kernel_node);
  // Per-tensor byte sizes, filled by InitInputOutputSize().
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
// Stateless shape/offset helpers shared by CPU kernels.
class CPUKernelUtils {
 public:
  // Left-pads a shape with 1s until it is 4-D.
  static void ExpandDimsTo4(std::vector<size_t> *shape);
  // Row-major flat offset into a 4-D tensor.
  static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
  // Product of the dimensions after `axis` (assumes a 4-D shape).
  static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
  // Row-major strides (in elements) for every axis.
  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
  // Runs task(start, end) over [0, count) on the shared thread pool.
  static void ParallelFor(const CTask &task, size_t count);
  // Collapses a shape to 2-D around `axis`.
  static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
  // Numpy-style broadcast of two shapes.
  static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
};
// Iterates the output of a broadcast binary operation while exposing, for the
// current output element, the flat positions of both (possibly broadcast)
// inputs. Usage: it.SetPos(start); loop { use GetInputPosA/B(); it.GenNextPos(); }
class BroadcastIterator {
 public:
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;
  // Flat element index into input a / input b for the current output position.
  inline size_t GetInputPosA() const { return input_pos_[0]; }
  inline size_t GetInputPosB() const { return input_pos_[1]; }
  void SetPos(size_t pos);
  void GenNextPos();
 private:
  // Pads the input shapes with leading 1s to the output rank.
  void BroadcastShape();
  // Precomputes broadcast-aware strides and back strides for both inputs.
  void InitStrides();
  std::vector<size_t> coordinates_;  // current output coordinate
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  std::array<size_t, 2> input_pos_{0};  // flat positions {input a, input b}
  int output_dimension_{0};
};
// Walks a transposed tensor in row-major order, exposing via GetPos() the
// flat index of the current element inside the original (untransposed) buffer.
class TransposeIterator {
 public:
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;
  inline size_t GetPos() const { return pos_; }
  void SetPos(size_t pos);
  void GenNextPos();
 private:
  int dimension_{0};                  // tensor rank
  std::vector<size_t> coordinates_;   // current output coordinate
  std::vector<size_t> shape_;         // transposed (output) shape
  std::vector<size_t> strides_;       // input strides permuted by axes_
  std::vector<size_t> back_strides_;  // (shape - 1) * stride per axis
  std::vector<size_t> axes_;          // axis permutation
  size_t pos_{0};                     // flat position in the input buffer
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
#include <functional>
#include <memory>
#include <numeric>
#include <string>
#include <thread>
#include <vector>
#include "backend/kernel_compiler/kernel.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/kernel_compiler/common_utils.h"
#include "ir/anf.h"
using mindspore::kernel::Address;
using mindspore::kernel::AddressPtr;
using CTask = std::function<void(size_t, size_t)>;
namespace mindspore {
namespace kernel {
// Attribute keys used by CPU kernels when querying node attributes
// (e.g. via AnfAlgo::GetNodeAttr) during InitKernel.
const char KERNEL_SIZE[] = "kernel_size";
const char STRIDE[] = "stride";
const char STRIDES[] = "strides";
const char DILATION[] = "dilation";
const char DILATIONS[] = "dilations";
const char FORMAT[] = "format";
const char PAD[] = "pad";
const char PAD_LIST[] = "pad_list";
const char PAD_MODE[] = "pad_mode";
const char PAD_MODE_LOWER_SAME[] = "same";
const char PAD_MODE_LOWER_VALID[] = "valid";
const char PAD_MODE_UPPER_SAME[] = "SAME";
const char PAD_MODE_UPPER_VALID[] = "VALID";
const char TRANSPOSE_A[] = "transpose_a";
const char TRANSPOSE_B[] = "transpose_b";
const char IS_GRAD[] = "is_grad";
// Single-character flags describing whether a matmul operand is transposed.
const char TRANSPOSE_NO = 'N';
const char TRANSPOSE_YES = 'T';
const char AXIS[] = "axis";
const char DIM[] = "dim";
const char BEGIN[] = "begin";
const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
const char START[] = "start";
const char LIMIT[] = "limit";
const char DELTA[] = "delta";
const char SORTED[] = "sorted";
const char ADJ_ST[] = "adjoint_st";
const char ADJ_dT[] = "adjoint_dt";
// Operation selector shared by the generic element-wise / arithmetic CPU
// kernels. NOTE(review): enumerator values are implicitly sequential from 0;
// confirm no external table depends on the numeric values before reordering.
enum OperateType {
  ADD = 0,
  SUB,
  MUL,
  DIV,
  SQUARE,
  SQRT,
  POW,
  REALDIV,
  FLOORDIV,
  MOD,
  FLOORMOD,
  NEG,
  LESS,
  ASSIGNADD,
  RELUGRAD,
  RELU6GRAD,
  ABSGRAD,
  TANHGRAD,
  SQRTGRAD,
  SIGMOIDGRAD,
  ONESLIKE,
  ZEROSLIKE,
  SIGN,
  EQUAL,
  NOTEQUAL,
  LESSEQUAL,
  LOGICALAND,
  LOGICALOR,
  LOGICALNOT,
  FLOOR,
  SQUAREDDIFFERENCE,
  GREATER,
  GREATEREQUAL,
  RECIPROCAL,
  GELU,
  GELUGRAD,
  ASIN,
  ACOS,
  ATAN,
  ASINGRAD,
  ACOSGRAD,
  ATANGRAD,
  SIN,
  COS,
  TAN,
  SINH,
  COSH,
  ASINH,
  ACOSH,
  ATANH,
  ASINHGRAD,
  ACOSHGRAD,
  ATAN2,
  RINT,
  ROUND,
  IDENTITY,
};
// Base class for CPU kernels. Subclasses implement InitKernel() (parse the
// graph node) and the 3-argument Launch() (execute on host buffers).
class CPUKernel : public kernel::KernelMod {
 public:
  CPUKernel() = default;
  ~CPUKernel() override = default;
  // Parses the node (InitKernel) and then fills the input/output size lists.
  virtual void Init(const CNodePtr &kernel_node);
  virtual void InitKernel(const CNodePtr &kernel_node) = 0;
  // KernelMod adapter; stream_ptr is unused on CPU.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override {
    return Launch(inputs, workspace, outputs);
  };
  virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                      const std::vector<AddressPtr> &outputs) = 0;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
 protected:
  virtual void InitInputOutputSize(const CNodePtr &kernel_node);
  // Per-tensor byte sizes, filled by InitInputOutputSize().
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
// Stateless shape/offset helpers shared by CPU kernels.
class CPUKernelUtils {
 public:
  // Left-pads a shape with 1s until it is 4-D.
  static void ExpandDimsTo4(std::vector<size_t> *shape);
  // Row-major flat offset into a 4-D tensor.
  static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
  // Product of the dimensions after `axis` (assumes a 4-D shape).
  static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
  // Row-major strides (in elements) for every axis.
  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
  // Runs task(start, end) over [0, count) on the shared thread pool.
  static void ParallelFor(const CTask &task, size_t count);
  // Collapses a shape to 2-D around `axis`.
  static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
  // Numpy-style broadcast of two shapes.
  static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
};
// Iterates the output of a broadcast binary operation while exposing, for the
// current output element, the flat positions of both (possibly broadcast)
// inputs. Usage: it.SetPos(start); loop { use GetInputPosA/B(); it.GenNextPos(); }
class BroadcastIterator {
 public:
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;
  // Flat element index into input a / input b for the current output position.
  inline size_t GetInputPosA() const { return input_pos_[0]; }
  inline size_t GetInputPosB() const { return input_pos_[1]; }
  void SetPos(size_t pos);
  void GenNextPos();
 private:
  // Pads the input shapes with leading 1s to the output rank.
  void BroadcastShape();
  // Precomputes broadcast-aware strides and back strides for both inputs.
  void InitStrides();
  std::vector<size_t> coordinates_;  // current output coordinate
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  std::array<size_t, 2> input_pos_{0};  // flat positions {input a, input b}
  int output_dimension_{0};
};
// Walks a transposed tensor in row-major order, exposing via GetPos() the
// flat index of the current element inside the original (untransposed) buffer.
class TransposeIterator {
 public:
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;
  inline size_t GetPos() const { return pos_; }
  void SetPos(size_t pos);
  void GenNextPos();
 private:
  int dimension_{0};                  // tensor rank
  std::vector<size_t> coordinates_;   // current output coordinate
  std::vector<size_t> shape_;         // transposed (output) shape
  std::vector<size_t> strides_;       // input strides permuted by axes_
  std::vector<size_t> back_strides_;  // (shape - 1) * stride per axis
  std::vector<size_t> axes_;          // axis permutation
  size_t pos_{0};                     // flat position in the input buffer
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_

View File

@@ -1,340 +1,340 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Read shapes, dtype and attributes from the CTCLoss graph node and cache
// them. probs is expected as [max_time, batch, num_class]; labels arrive in
// sparse form (2-D indices + 1-D values). The blank label is the last class.
void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  CheckParam(kernel_node);
  probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
  labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  if (probs_shape_.size() != 3) {
    MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " not support.";
  }
  if (labels_dims_.size() != 1) {
    MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support.";
  }
  if (indice_dims_.size() != 2) {
    MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support.";
  }
  preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated");
  ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated");
  ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
  max_time_ = probs_shape_[0];
  batch_size_ = probs_shape_[1];
  num_class_ = probs_shape_[2];
  blank_index_ = num_class_ - 1;  // CTC convention: blank is the last class
}
// Dispatch the typed implementation on the cached input dtype.
// NOTE(review): dtypes other than fp16/fp32 fall through and return true
// without computing anything — confirm whether an explicit error is intended.
bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                              const std::vector<kernel::AddressPtr> &outputs) {
  if (dtype_ == kNumberTypeFloat16) {
    LaunchKernel<float16>(inputs, outputs);
  } else if (dtype_ == kNumberTypeFloat32) {
    LaunchKernel<float>(inputs, outputs);
  }
  return true;
}
// Numerically stable log(exp(a) + exp(b)); -infinity acts as log(0).
template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
  const T log_zero = -std::numeric_limits<T>::infinity();
  if (logprob1 <= log_zero) {
    return logprob2;
  }
  if (logprob2 <= log_zero) {
    return logprob1;
  }
  // Factor out the larger argument so the exponential cannot overflow.
  const T larger = (logprob1 > logprob2) ? logprob1 : logprob2;
  const T smaller = (logprob1 > logprob2) ? logprob2 : logprob1;
  return larger + static_cast<T>(log1p(exp(smaller - larger)));
}
// CTC forward (alpha) recursion in log space over the blank-augmented label.
// log_alpha_b[u][t] accumulates the log-probability of having emitted the
// first u+1 symbols of label_with_blank within the first t+1 time steps.
template <typename TT>
void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
                                       const std::vector<std::vector<TT>> &y,
                                       std::vector<std::vector<TT>> *log_alpha_b) {
  int U = label_with_blank.size();   // blank-augmented label length
  int T = (*log_alpha_b)[0].size();  // number of time steps
  TT kLogZero_ = -std::numeric_limits<TT>::infinity();  // log(0)
  // t = 0: the path can only be in state 0 (blank) or state 1 (first label).
  (*log_alpha_b)[0][0] = static_cast<TT>(log(y[blank_index_][0]));
  auto label_0 = (label_with_blank.size() > 1) ? label_with_blank[1] : blank_index_;
  if (label_with_blank.size() > 1) {
    (*log_alpha_b)[1][0] = static_cast<TT>(log(y[label_0][0]));
  }
  for (int t = 1; t < T; ++t) {
    // Restrict to states reachable from the start and able to reach the end.
    int low = std::max(0, U - (2 * (T - t)));
    int high = std::min(U, 2 * (t + 1));
    for (int u = low; u < high; ++u) {
      auto sum_log_alpha_b = kLogZero_;
      // Stay in the same state (always allowed for blanks; for labels only
      // when repeated emissions are merged).
      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
        sum_log_alpha_b = (*log_alpha_b)[u][t - 1];
      }
      // Advance from the previous state.
      if (u > 0) {
        sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]);
      }
      // Skip the blank between two distinct labels.
      if (u > 1) {
        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]);
        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
          sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]);
        }
      }
      (*log_alpha_b)[u][t] =
        static_cast<TT>(log(static_cast<TT>(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b;
    }
  }
}
// CTC backward (beta) recursion in log space over the blank-augmented label.
// log_beta_b[u][t] accumulates the log-probability of completing the label
// suffix starting at state u from time t onward.
template <typename TT>
void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank,
                                       const std::vector<std::vector<TT>> &y,
                                       std::vector<std::vector<TT>> *log_beta_b) {
  int T = (*log_beta_b)[0].size();
  int U = label_with_blank.size();
  // Termination: a valid path may end in the last label or the trailing blank.
  if (U > 1) {
    for (int u = U - 2; u < U; ++u) {
      (*log_beta_b)[u][T - 1] = TT(0);
    }
  } else {
    // NOTE(review): with U == 1 this writes column T - 2; if T == 1 that
    // index underflows — confirm sequence length >= 2 holds for this case.
    (*log_beta_b)[0][T - 1] = TT(0);
    (*log_beta_b)[0][T - 2] = TT(0);
  }
  for (int t = T - 2; t >= 0; --t) {
    // Restrict to states reachable from the start and able to reach the end.
    int low = std::max(0, U - (2 * (T - t)));
    int high = std::min(U, 2 * (t + 1));
    for (int u = low; u < high; ++u) {
      // Stay in the same state.
      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
        (*log_beta_b)[u][t] =
          LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1])));
      }
      // Move to the next state.
      if (u + 1 < U) {
        (*log_beta_b)[u][t] =
          LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1])));
      }
      // Skip the blank between two distinct labels.
      if (u + 2 < U) {
        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]);
        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
          (*log_beta_b)[u][t] =
            LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1])));
        }
      }
    }
  }
}
// Gradient of the CTC loss wrt the softmax probabilities:
//   dy[l][t] = y[l][t] - P(class l at time t | inputs, target)
// where the posterior is gathered from alpha[u][t] + beta[u][t] over all
// augmented-label positions u emitting class l, normalized by log_pzx
// (the total path log-likelihood).
template <typename TT>
void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_blank,
                                     const std::vector<std::vector<TT>> &y,
                                     const std::vector<std::vector<TT>> &log_alpha_b,
                                     const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx,
                                     std::vector<std::vector<TT>> *dy) {
  auto dy_b = dy;
  TT kLogZero_ = -std::numeric_limits<TT>::infinity();
  // No alignment has non-zero probability: leave dy untouched.
  if (log_pzx <= kLogZero_) {
    MS_LOG(INFO) << "No valid path found";
    return;
  }
  size_t L = y.size();                 // number of classes
  size_t T = y[0].size();              // time steps
  size_t U = label_with_blank.size();  // augmented label length
  for (size_t t = 0; t < T; ++t) {
    std::vector<TT> prob_sum(L, kLogZero_);
    for (size_t u = 0; u < U; ++u) {
      uint32_t l = label_with_blank[u];
      prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]);
    }
    for (size_t l = 0; l < L; ++l) {
      (*dy_b)[l][t] = y[l][t] - static_cast<TT>(exp(prob_sum[l] - log_pzx));
    }
  }
}
// Convert each batch entry's label sequence into the blank-augmented form
// (blank, l1, blank, l2, ..., ln, blank) used by the CTC recursions.
// Optionally collapses consecutive repeated labels first
// (preprocess_collapse_repeated_) and validates label range and length.
void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                                         std::vector<std::vector<uint32_t>> *label_with_blank) {
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> l;
    const std::vector<uint32_t> &label = batch_label[b];
    bool has_blank = false;
    for (size_t i = 0; i < label.size(); ++i) {
      if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) {
        // Values >= num_class - 1 are treated as blanks and may only trail
        // the valid labels.
        if (label[i] >= num_class_ - 1) {
          has_blank = true;
        } else {
          if (has_blank) {
            MS_LOG(EXCEPTION) << "Invalid labels(index >= num_class - 1) should not appear between two valid labels";
          }
          l.push_back(label[i]);
        }
      }
    }
    if (!ignore_longer_outputs_than_inputs_) {
      // CTC needs at least as many time steps as output labels.
      if (l.size() > seq_len[b]) {
        MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets "
                          << seq_len[b] << "< " << l.size();
      }
    }
    (*label_with_blank)[b].reserve(2 * l.size() + 1);
    // Interleave a blank before every label, plus one trailing blank.
    for (auto l_i : l) {
      (*label_with_blank)[b].push_back(blank_index_);
      (*label_with_blank)[b].push_back(l_i);
    }
    (*label_with_blank)[b].push_back(blank_index_);
  }
}
// Column-wise softmax over classes for batch entry `b`.
// The input is laid out as [time, batch, class]; the result is written to
// (*softmax_probs)[class][time].
template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
                  size_t num_class, size_t batch_size, size_t b) {
  for (size_t t = 0; t < sequence_length; ++t) {
    const size_t base = t * batch_size * num_class + b * num_class;
    // Shift by the running max (seeded with 0, as before) so exp() stays small.
    T max_logit(T(0));
    for (size_t c = 0; c < num_class; ++c) {
      if (inputs_addr[base + c] > max_logit) {
        max_logit = inputs_addr[base + c];
      }
    }
    // Exponentiate once per class, accumulating the normalizer as we go.
    T denom(T(0));
    for (size_t c = 0; c < num_class; ++c) {
      const T shifted_exp = static_cast<T>(exp(inputs_addr[base + c] - max_logit));
      denom += shifted_exp;
      (*softmax_probs)[c][t] = shifted_exp;
    }
    for (size_t c = 0; c < num_class; ++c) {
      (*softmax_probs)[c][t] /= denom;
    }
  }
}
// Resize *array2D to row x col. Newly created cells are filled with
// init_value; pre-existing elements that fall inside the new bounds keep
// their values (plain resize semantics).
template <typename T>
void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
  array2D->resize(row);
  for (auto &line : *array2D) {
    line.resize(col, init_value);
  }
}
// Typed CTC implementation: softmax the activations, run the forward/backward
// dynamic program per batch entry, then emit the per-batch loss and the gradient.
template <typename T>
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
  auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
  auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
  auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
  auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
  std::vector<std::vector<uint32_t>> label_batch;
  std::vector<std::vector<uint32_t>> labels_with_blank;
  std::vector<uint64_t> each_label_length;
  label_batch.resize(batch_size_);
  labels_with_blank.resize(batch_size_);
  each_label_length.resize(batch_size_, 0);
  T kLogZero_ = -std::numeric_limits<T>::infinity();
  // check validation of sequence length
  for (size_t b = 0; b < batch_size_; ++b) {
    if (sequence_length_addr[b] == uint32_t(0)) {
      MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b];
    }
    if (sequence_length_addr[b] > max_time_) {
      MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < "
                        << sequence_length_addr[b];
    }
  }
  // Count labels per batch row; indices are stored as rank-2 (row, position)
  // pairs, so element i * 2 is the batch row of the i-th label value.
  for (size_t i = 0; i < indice_dims_[0]; ++i) {
    each_label_length[labels_indices_addr[i * 2]]++;
  }
  // convert label format of label_value and label_indices to batch_label
  uint64_t cum_sum = 0;
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> *b_value = &label_batch[b];
    for (size_t l = 0; l < each_label_length[b]; ++l) {
      b_value->push_back(labels_values_addr[cum_sum + l]);
    }
    cum_sum += each_label_length[b];
  }
  // convert label to label with blank
  GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> label_with_blank = labels_with_blank[b];
    // y_b [num_class, sequence_length]
    std::vector<std::vector<T>> y_b;
    std::vector<std::vector<T>> dy;
    std::vector<std::vector<T>> log_alpha_b;
    std::vector<std::vector<T>> log_beta_b;
    MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
    MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
    InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);
    CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
    CalculateBwdVar(label_with_blank, y_b, &log_beta_b);
    // Total path log-probability: sum (in log space) of alpha * beta at t == 0.
    T log_pzx = kLogZero_;
    for (size_t u = 0; u < label_with_blank.size(); ++u) {
      log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
    }
    loss_addr[b] = -log_pzx;
    CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);
    // Scatter dy back into the (time, batch, class) gradient layout.
    for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
      for (size_t c = 0; c < num_class_; ++c) {
        gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t];
      }
    }
  }
}
// Validates node arity: CTCLoss takes 4 inputs (probs, label indices, label
// values, sequence lengths) and produces 2 outputs (loss, gradient).
void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 4) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num;
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 2) {
    // Fixed message: missing space before the streamed count.
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets " << output_num;
  }
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Caches shapes, dtype and op attributes from the graph node and validates ranks.
void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  CheckParam(kernel_node);
  probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
  labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  if (probs_shape_.size() != 3) {
    MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " not support.";
  }
  if (labels_dims_.size() != 1) {
    MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support.";
  }
  if (indice_dims_.size() != 2) {
    MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support.";
  }
  preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated");
  ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated");
  ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
  // probs layout is (max_time, batch, num_class); blank is the last class index.
  max_time_ = probs_shape_[0];
  batch_size_ = probs_shape_[1];
  num_class_ = probs_shape_[2];
  blank_index_ = num_class_ - 1;
}
// Dispatches to the typed implementation based on the probs input dtype.
bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                              const std::vector<kernel::AddressPtr> &outputs) {
  switch (dtype_) {
    case kNumberTypeFloat16:
      LaunchKernel<float16>(inputs, outputs);
      break;
    case kNumberTypeFloat32:
      LaunchKernel<float>(inputs, outputs);
      break;
    default:
      // Other dtypes are silently ignored, matching the registered kernel attrs.
      break;
  }
  return true;
}
// Returns log(exp(logprob1) + exp(logprob2)), evaluated stably in log space.
// -infinity acts as the additive identity (log of zero probability).
template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
  const T log_zero = -std::numeric_limits<T>::infinity();
  if (logprob1 <= log_zero) {
    return logprob2;
  }
  if (logprob2 <= log_zero) {
    return logprob1;
  }
  const T hi = (logprob1 > logprob2) ? logprob1 : logprob2;
  const T lo = (logprob1 > logprob2) ? logprob2 : logprob1;
  return hi + static_cast<T>(std::log1p(std::exp(lo - hi)));
}
// Forward (alpha) recursion of the CTC dynamic program over the blank-augmented
// label sequence: (*log_alpha_b)[u][t] is the log-probability of all path
// prefixes that end in state u at time t.
template <typename TT>
void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
                                       const std::vector<std::vector<TT>> &y,
                                       std::vector<std::vector<TT>> *log_alpha_b) {
  int U = label_with_blank.size();
  int T = (*log_alpha_b)[0].size();
  TT kLogZero_ = -std::numeric_limits<TT>::infinity();
  // Initialization: a path may start with the leading blank or the first label.
  (*log_alpha_b)[0][0] = static_cast<TT>(log(y[blank_index_][0]));
  auto label_0 = (label_with_blank.size() > 1) ? label_with_blank[1] : blank_index_;
  if (label_with_blank.size() > 1) {
    (*log_alpha_b)[1][0] = static_cast<TT>(log(y[label_0][0]));
  }
  for (int t = 1; t < T; ++t) {
    // Restrict u to states reachable from the start and able to reach the end.
    int low = std::max(0, U - (2 * (T - t)));
    int high = std::min(U, 2 * (t + 1));
    for (int u = low; u < high; ++u) {
      auto sum_log_alpha_b = kLogZero_;
      // Stay on the same symbol (always allowed for blanks; for labels only
      // when repeated emissions are merged).
      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
        sum_log_alpha_b = (*log_alpha_b)[u][t - 1];
      }
      // Arrive from the immediately preceding state.
      if (u > 0) {
        sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]);
      }
      // Skip the blank between two distinct (non-merging) labels.
      if (u > 1) {
        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]);
        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
          sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]);
        }
      }
      (*log_alpha_b)[u][t] =
        static_cast<TT>(log(static_cast<TT>(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b;
    }
  }
}
// Backward (beta) recursion of the CTC dynamic program over the blank-augmented
// label sequence: (*log_beta_b)[u][t] is the log-probability of all path
// suffixes that start in state u at time t.
template <typename TT>
void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank,
                                       const std::vector<std::vector<TT>> &y,
                                       std::vector<std::vector<TT>> *log_beta_b) {
  int T = (*log_beta_b)[0].size();
  int U = label_with_blank.size();
  // Initialization: paths may finish in the last label or the trailing blank.
  if (U > 1) {
    for (int u = U - 2; u < U; ++u) {
      (*log_beta_b)[u][T - 1] = TT(0);
    }
  } else {
    (*log_beta_b)[0][T - 1] = TT(0);
    if (T > 1) {
      // Guard: when T == 1 there is no column T - 2; the unguarded write was
      // out of bounds ((*log_beta_b)[0][-1]).
      (*log_beta_b)[0][T - 2] = TT(0);
    }
  }
  for (int t = T - 2; t >= 0; --t) {
    int low = std::max(0, U - (2 * (T - t)));
    int high = std::min(U, 2 * (t + 1));
    for (int u = low; u < high; ++u) {
      // Stay on the same symbol (always allowed for blanks; for labels only
      // when repeated emissions are merged).
      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
        (*log_beta_b)[u][t] =
          LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1])));
      }
      // Advance to the next state.
      if (u + 1 < U) {
        (*log_beta_b)[u][t] =
          LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1])));
      }
      // Skip the blank between two distinct (non-merging) labels.
      if (u + 2 < U) {
        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]);
        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
          (*log_beta_b)[u][t] =
            LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1])));
        }
      }
    }
  }
}
// Gradient of the CTC loss w.r.t. the softmax outputs:
// dy[l][t] = y[l][t] - P(class l emitted at time t | target label), computed
// from the alpha/beta tables and the total path log-probability log_pzx.
template <typename TT>
void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_blank,
                                     const std::vector<std::vector<TT>> &y,
                                     const std::vector<std::vector<TT>> &log_alpha_b,
                                     const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx,
                                     std::vector<std::vector<TT>> *dy) {
  TT kLogZero_ = -std::numeric_limits<TT>::infinity();
  if (log_pzx <= kLogZero_) {
    // No alignment has non-zero probability; leave dy as initialized by the caller.
    MS_LOG(INFO) << "No valid path found";
    return;
  }
  size_t L = y.size();
  size_t T = y[0].size();
  size_t U = label_with_blank.size();
  for (size_t t = 0; t < T; ++t) {
    // Per class, accumulate (in log space) the mass of all label positions
    // carrying that class at time t.
    std::vector<TT> prob_sum(L, kLogZero_);
    for (size_t u = 0; u < U; ++u) {
      uint32_t l = label_with_blank[u];
      prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]);
    }
    // Write through dy directly; the redundant local pointer alias was removed.
    for (size_t l = 0; l < L; ++l) {
      (*dy)[l][t] = y[l][t] - static_cast<TT>(exp(prob_sum[l] - log_pzx));
    }
  }
}
// Validates raw labels and inserts blanks: {l1, l2} -> {blank, l1, blank, l2, blank}.
// With preprocess_collapse_repeated_, consecutive duplicate labels are first merged.
// Values >= num_class - 1 denote the blank and may only trail a sequence.
void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                                         std::vector<std::vector<uint32_t>> *label_with_blank) {
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> l;
    const std::vector<uint32_t> &label = batch_label[b];
    bool has_blank = false;
    for (size_t i = 0; i < label.size(); ++i) {
      if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) {
        if (label[i] >= num_class_ - 1) {
          has_blank = true;
        } else {
          if (has_blank) {
            MS_LOG(EXCEPTION) << "Invalid labels(index >= num_class - 1) should not appear between two valid labels";
          }
          l.push_back(label[i]);
        }
      }
    }
    if (!ignore_longer_outputs_than_inputs_) {
      if (l.size() > seq_len[b]) {
        // Fixed message: grammar ("should be greater than") and spacing around '<'.
        MS_LOG(EXCEPTION) << "Input time(sequence length) should be greater than output size(label length), but gets "
                          << seq_len[b] << " < " << l.size();
      }
    }
    (*label_with_blank)[b].reserve(2 * l.size() + 1);
    for (auto l_i : l) {
      (*label_with_blank)[b].push_back(blank_index_);
      (*label_with_blank)[b].push_back(l_i);
    }
    (*label_with_blank)[b].push_back(blank_index_);
  }
}
// Computes a numerically stable softmax over the class axis for batch entry b at
// every time step, writing results into (*softmax_probs)[c][t] (class-major).
// Input layout: inputs_addr[t * batch_size * num_class + b * num_class + c].
template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
                  size_t num_class, size_t batch_size, size_t b) {
  for (size_t t = 0; t < sequence_length; ++t) {
    if (num_class == 0) {
      continue;
    }
    const size_t base = t * batch_size * num_class + b * num_class;
    // Seed the max reduction with the first logit instead of 0: with a zero seed,
    // uniformly very negative logits underflow every exp() and produce 0/0 = NaN.
    T max_logit = inputs_addr[base];
    for (size_t c = 1; c < num_class; ++c) {
      if (inputs_addr[base + c] > max_logit) {
        max_logit = inputs_addr[base + c];
      }
    }
    T sum_exp = T(0);
    for (size_t c = 0; c < num_class; ++c) {
      T shifted = static_cast<T>(std::exp(inputs_addr[base + c] - max_logit));
      (*softmax_probs)[c][t] = shifted;
      sum_exp += shifted;
    }
    for (size_t c = 0; c < num_class; ++c) {
      (*softmax_probs)[c][t] /= sum_exp;
    }
  }
}
// Shapes *array2D into a row x col matrix; newly created cells get init_value
// (resize semantics: already-existing cells keep their current contents).
template <typename T>
void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
  array2D->resize(row);
  for (auto &row_vec : *array2D) {
    row_vec.resize(col, init_value);
  }
}
// Typed CTC implementation: softmax the activations, run the forward/backward
// dynamic program per batch entry, then emit the per-batch loss and the gradient.
template <typename T>
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
  auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
  auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
  auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
  auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
  std::vector<std::vector<uint32_t>> label_batch;
  std::vector<std::vector<uint32_t>> labels_with_blank;
  std::vector<uint64_t> each_label_length;
  label_batch.resize(batch_size_);
  labels_with_blank.resize(batch_size_);
  each_label_length.resize(batch_size_, 0);
  T kLogZero_ = -std::numeric_limits<T>::infinity();
  // check validation of sequence length
  for (size_t b = 0; b < batch_size_; ++b) {
    if (sequence_length_addr[b] == uint32_t(0)) {
      MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b];
    }
    if (sequence_length_addr[b] > max_time_) {
      MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < "
                        << sequence_length_addr[b];
    }
  }
  // Count labels per batch row; indices are stored as rank-2 (row, position)
  // pairs, so element i * 2 is the batch row of the i-th label value.
  for (size_t i = 0; i < indice_dims_[0]; ++i) {
    each_label_length[labels_indices_addr[i * 2]]++;
  }
  // convert label format of label_value and label_indices to batch_label
  uint64_t cum_sum = 0;
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> *b_value = &label_batch[b];
    for (size_t l = 0; l < each_label_length[b]; ++l) {
      b_value->push_back(labels_values_addr[cum_sum + l]);
    }
    cum_sum += each_label_length[b];
  }
  // convert label to label with blank
  GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> label_with_blank = labels_with_blank[b];
    // y_b [num_class, sequence_length]
    std::vector<std::vector<T>> y_b;
    std::vector<std::vector<T>> dy;
    std::vector<std::vector<T>> log_alpha_b;
    std::vector<std::vector<T>> log_beta_b;
    MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
    MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
    InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);
    CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
    CalculateBwdVar(label_with_blank, y_b, &log_beta_b);
    // Total path log-probability: sum (in log space) of alpha * beta at t == 0.
    T log_pzx = kLogZero_;
    for (size_t u = 0; u < label_with_blank.size(); ++u) {
      log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
    }
    loss_addr[b] = -log_pzx;
    CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);
    // Scatter dy back into the (time, batch, class) gradient layout.
    for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
      for (size_t c = 0; c < num_class_; ++c) {
        gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t];
      }
    }
  }
}
// Validates node arity: CTCLoss takes 4 inputs (probs, label indices, label
// values, sequence lengths) and produces 2 outputs (loss, gradient).
void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 4) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num;
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 2) {
    // Fixed message: missing space before the streamed count.
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets " << output_num;
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,92 +1,92 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
#include <memory>
#include <unordered_map>
#include <vector>
#include <algorithm>
#include <limits>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the CTCLoss (Connectionist Temporal Classification) op:
// produces a per-batch loss and the gradient w.r.t. the input activations.
class CTCLossCPUKernel : public CPUKernel {
 public:
  CTCLossCPUKernel() = default;
  ~CTCLossCPUKernel() override = default;
  // Caches shapes, dtype and op attributes from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches to LaunchKernel<T> based on the probs input dtype.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
  // Validates labels and inserts blanks: {l1, l2} -> {blank, l1, blank, l2, blank}.
  void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                         std::vector<std::vector<uint32_t>> *label_with_blank);
  // Forward (alpha) recursion of the CTC dynamic program.
  template <typename T>
  void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_alpha_b);
  // Backward (beta) recursion of the CTC dynamic program.
  template <typename T>
  void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_beta_b);
  // Gradient w.r.t. the softmax outputs, from the alpha/beta tables.
  template <typename T>
  void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                     const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b,
                     const T log_pzx, std::vector<std::vector<T>> *dy);
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
 private:
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> probs_shape_;   // (max_time, batch, num_class)
  std::vector<size_t> indice_dims_;   // labels_indices shape, rank 2
  std::vector<size_t> labels_dims_;   // labels_values shape, rank 1
  size_t num_class_;
  size_t max_time_;
  size_t batch_size_;
  uint32_t blank_index_;  // num_class_ - 1
  TypeId dtype_{kTypeUnknown};
  bool preprocess_collapse_repeated_;
  bool ctc_merge_repeated_;
  bool ignore_longer_outputs_than_inputs_;
};
// Register CTCLoss for float16/float32 activations; label indices are int64,
// label values and sequence lengths are int32; outputs match the probs dtype.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  CTCLossCPUKernel);
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  CTCLossCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
#include <memory>
#include <unordered_map>
#include <vector>
#include <algorithm>
#include <limits>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the CTCLoss (Connectionist Temporal Classification) op:
// produces a per-batch loss and the gradient w.r.t. the input activations.
class CTCLossCPUKernel : public CPUKernel {
 public:
  CTCLossCPUKernel() = default;
  ~CTCLossCPUKernel() override = default;
  // Caches shapes, dtype and op attributes from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches to LaunchKernel<T> based on the probs input dtype.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
  // Validates labels and inserts blanks: {l1, l2} -> {blank, l1, blank, l2, blank}.
  void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                         std::vector<std::vector<uint32_t>> *label_with_blank);
  // Forward (alpha) recursion of the CTC dynamic program.
  template <typename T>
  void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_alpha_b);
  // Backward (beta) recursion of the CTC dynamic program.
  template <typename T>
  void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_beta_b);
  // Gradient w.r.t. the softmax outputs, from the alpha/beta tables.
  template <typename T>
  void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                     const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b,
                     const T log_pzx, std::vector<std::vector<T>> *dy);
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
 private:
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> probs_shape_;   // (max_time, batch, num_class)
  std::vector<size_t> indice_dims_;   // labels_indices shape, rank 2
  std::vector<size_t> labels_dims_;   // labels_values shape, rank 1
  size_t num_class_;
  size_t max_time_;
  size_t batch_size_;
  uint32_t blank_index_;  // num_class_ - 1
  TypeId dtype_{kTypeUnknown};
  bool preprocess_collapse_repeated_;
  bool ctc_merge_repeated_;
  bool ignore_longer_outputs_than_inputs_;
};
// Register CTCLoss for float16/float32 activations; label indices are int64,
// label values and sequence lengths are int32; outputs match the probs dtype.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  CTCLossCPUKernel);
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  CTCLossCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_

View File

@ -1,89 +1,89 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h"
#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Caches input/output device shapes and the "block_size" attribute from the node.
template <typename T>
void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  // Spatial block edge length (int64 attr stored into size_t).
  block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
}
// Rearranges depth (channel) data into spatial blocks: for each flat output
// index, derive its multi-dim coordinates, map them back to the corresponding
// input element, and copy. Parallelized over the flat output index.
// NOTE(review): the index math assumes 4-D NCHW shapes (output_strides has
// exactly 3 slots) -- confirm upstream shape validation guarantees rank 4.
template <typename T>
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  size_t output_strides[3] = {1, 1, 1};
  // Row-major strides of the output for its first three dims:
  // output_strides[j] = product of output_shape[j+1 ..].
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      output_strides[j] *= output_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> output_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose flat output index i into per-dimension coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        output_pos_array[j] = tmp_pos / output_strides[j];
        tmp_pos %= output_strides[j];
      }
      output_pos_array.back() = tmp_pos;
      // Rebuild the flat input index: the (h % block, w % block) offset selects
      // which group of input channels this output pixel came from.
      size_t input_pos = output_pos_array[0];
      input_pos =
        (input_pos * input_shape[1]) +
        (output_pos_array[1] +
         (block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]);
      input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size);
      input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size);
      output_addr[i] = input_addr[input_pos];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
// Validates node arity: DepthToSpace is a single-input, single-output op.
template <typename T>
void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 1) {
    // Fixed message typo: "DepthToSpaceCPUKerrnel" -> "DepthToSpaceCPUKernel".
    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKernel needs 1 input.";
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
  }
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h"
#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Caches input/output device shapes and the "block_size" attribute from the node.
template <typename T>
void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  // Spatial block edge length (int64 attr stored into size_t).
  block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
}
// Rearranges depth (channel) data into spatial blocks: for each flat output
// index, derive its multi-dim coordinates, map them back to the corresponding
// input element, and copy. Parallelized over the flat output index.
// NOTE(review): the index math assumes 4-D NCHW shapes (output_strides has
// exactly 3 slots) -- confirm upstream shape validation guarantees rank 4.
template <typename T>
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  size_t output_strides[3] = {1, 1, 1};
  // Row-major strides of the output for its first three dims:
  // output_strides[j] = product of output_shape[j+1 ..].
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      output_strides[j] *= output_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> output_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose flat output index i into per-dimension coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        output_pos_array[j] = tmp_pos / output_strides[j];
        tmp_pos %= output_strides[j];
      }
      output_pos_array.back() = tmp_pos;
      // Rebuild the flat input index: the (h % block, w % block) offset selects
      // which group of input channels this output pixel came from.
      size_t input_pos = output_pos_array[0];
      input_pos =
        (input_pos * input_shape[1]) +
        (output_pos_array[1] +
         (block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]);
      input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size);
      input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size);
      output_addr[i] = input_addr[input_pos];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
// Validates node arity: DepthToSpace is a single-input, single-output op.
template <typename T>
void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 1) {
    // Fixed message typo: "DepthToSpaceCPUKerrnel" -> "DepthToSpaceCPUKernel".
    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKernel needs 1 input.";
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,85 +1,85 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
#include <memory>
#include <string>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the DepthToSpace op: rearranges channel (depth) data into
// spatial blocks of edge length block_size_.
template <typename T>
class DepthToSpaceCPUKernel : public CPUKernel {
 public:
  DepthToSpaceCPUKernel() = default;
  ~DepthToSpaceCPUKernel() override = default;
  // Caches shapes and the "block_size" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Copies each output element from its mapped input position, in parallel.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> input_shape_;
  std::vector<size_t> output_shape_;
  size_t block_size_;  // spatial block edge length (op attribute "block_size")
};
// Register DepthToSpace for all supported dtypes; input and output dtypes match.
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  DepthToSpaceCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  DepthToSpaceCPUKernel, float16);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
                    DepthToSpaceCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                    DepthToSpaceCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                    DepthToSpaceCPUKernel, int);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
                    DepthToSpaceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                    DepthToSpaceCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
                    DepthToSpaceCPUKernel, uint16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
                    DepthToSpaceCPUKernel, uint32_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
                    DepthToSpaceCPUKernel, uint64_t);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
#include <memory>
#include <string>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the DepthToSpace op: rearranges channel (depth) data into
// spatial blocks of edge length block_size_.
template <typename T>
class DepthToSpaceCPUKernel : public CPUKernel {
 public:
  DepthToSpaceCPUKernel() = default;
  ~DepthToSpaceCPUKernel() override = default;
  // Caches shapes and the "block_size" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Copies each output element from its mapped input position, in parallel.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> input_shape_;
  std::vector<size_t> output_shape_;
  size_t block_size_;  // spatial block edge length (op attribute "block_size")
};
// Register DepthToSpace for every supported element type; each registration
// requires matching input/output types (SetAllSameAttr(true)).
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  DepthToSpaceCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  DepthToSpaceCPUKernel, float16);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
                    DepthToSpaceCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                    DepthToSpaceCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                    DepthToSpaceCPUKernel, int);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
                    DepthToSpaceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                    DepthToSpaceCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
                    DepthToSpaceCPUKernel, uint16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
                    DepthToSpaceCPUKernel, uint32_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
                    DepthToSpaceCPUKernel, uint64_t);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_

View File

@ -1,102 +1,102 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h"
#include "backend/kernel_compiler/cpu/nnacl/errorcode.h"
#include "utils/ms_utils.h"
#include "common/thread_pool.h"
namespace mindspore {
namespace kernel {
// Element-wise int32 addition over the half-open index range [start, end),
// delegating to the nnacl ElementAddInt primitive. Throws on kernel failure.
void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) {
  const int count = end - start;
  if (ElementAddInt(in_0 + start, in_1 + start, out + start, count) != NNACL_OK) {
    MS_LOG(EXCEPTION) << "Add failed.";
  }
}
// Build the oneDNN binary-add primitive used by the float32 path.
// Only the first two inputs are described here; Launch() re-runs the same
// primitive to accumulate any remaining inputs into the output buffer.
void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);
  input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  // Describe both sources and the destination with the default memory layout.
  dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
  dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
  dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
  dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc);
  auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::binary>(prim_desc);
  // Register argument slots; Launch() binds the actual buffers per call.
  AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
  AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
  AddArgument(DNNL_ARG_DST, dst_mem_desc);
}
// Compute outputs[0] = sum of all inputs.
// float32: run the oneDNN binary-add primitive on inputs 0 and 1, then
//          accumulate each remaining input into the output buffer in place.
// int32:   same accumulation pattern via nnacl AddInt, parallelized over
//          element ranges with CPUKernelUtils::ParallelFor.
// Throws for any other dtype.
bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  if (dtype_ == kNumberTypeFloat32) {
    SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
    SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
    SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
    ExecutePrimitive();
    // output += inputs[index] for every remaining input.
    for (size_t index = 2; index < input_num_; ++index) {
      SetArgumentHandle(DNNL_ARG_SRC_0, outputs[0]->addr);
      SetArgumentHandle(DNNL_ARG_SRC_1, inputs[index]->addr);
      SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
      ExecutePrimitive();
    }
  } else if (dtype_ == kNumberTypeInt32) {
    size_t elements_num = outputs[0]->size / sizeof(int);
    const auto input_0 = reinterpret_cast<int *>(inputs[0]->addr);
    const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
    auto output = reinterpret_cast<int *>(outputs[0]->addr);
    auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
    CPUKernelUtils::ParallelFor(task_0, elements_num);
    for (size_t index = 2; index < input_num_; ++index) {
      const auto input = reinterpret_cast<int *>(inputs[index]->addr);
      auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
      CPUKernelUtils::ParallelFor(task, elements_num);
    }
  } else {
    MS_LOG(EXCEPTION) << "AddN only support float32 and int32, but got " << TypeIdToType(dtype_)->ToString();
  }
  return true;
}
// Validate an AddN node: every input shape must equal the output shape, and
// the node must produce exactly one output. Throws MS_LOG(EXCEPTION) on any
// violation.
void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  if (src0_shape != dst_shape) {
    MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape.";
  }
  // Bug fix: CheckParam runs from InitKernel BEFORE input_num_ is assigned,
  // so the member was still 0 here and the loop below never executed —
  // the per-input shape consistency check was silently skipped. Query the
  // input count directly (caching it is harmless; InitKernel re-assigns the
  // same value afterwards).
  input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t index = 1; index < input_num_; ++index) {
    auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index);
    if (src0_shape != src_shape) {
      MS_LOG(EXCEPTION) << "AddN input shapes must be equal.";
    }
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output.";
  }
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h"
#include "backend/kernel_compiler/cpu/nnacl/errorcode.h"
#include "utils/ms_utils.h"
#include "common/thread_pool.h"
namespace mindspore {
namespace kernel {
// Element-wise int32 addition over the half-open index range [start, end),
// delegating to the nnacl ElementAddInt primitive. Throws on kernel failure.
void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) {
  const int count = end - start;
  if (ElementAddInt(in_0 + start, in_1 + start, out + start, count) != NNACL_OK) {
    MS_LOG(EXCEPTION) << "Add failed.";
  }
}
// Build the oneDNN binary-add primitive used by the float32 path.
// Only the first two inputs are described here; Launch() re-runs the same
// primitive to accumulate any remaining inputs into the output buffer.
void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);
  input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  // Describe both sources and the destination with the default memory layout.
  dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
  dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
  dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
  dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc);
  auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::binary>(prim_desc);
  // Register argument slots; Launch() binds the actual buffers per call.
  AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
  AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
  AddArgument(DNNL_ARG_DST, dst_mem_desc);
}
// Compute outputs[0] = sum of all inputs.
// float32: run the oneDNN binary-add primitive on inputs 0 and 1, then
//          accumulate each remaining input into the output buffer in place.
// int32:   same accumulation pattern via nnacl AddInt, parallelized over
//          element ranges with CPUKernelUtils::ParallelFor.
// Throws for any other dtype.
bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  if (dtype_ == kNumberTypeFloat32) {
    SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
    SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
    SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
    ExecutePrimitive();
    // output += inputs[index] for every remaining input.
    for (size_t index = 2; index < input_num_; ++index) {
      SetArgumentHandle(DNNL_ARG_SRC_0, outputs[0]->addr);
      SetArgumentHandle(DNNL_ARG_SRC_1, inputs[index]->addr);
      SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
      ExecutePrimitive();
    }
  } else if (dtype_ == kNumberTypeInt32) {
    size_t elements_num = outputs[0]->size / sizeof(int);
    const auto input_0 = reinterpret_cast<int *>(inputs[0]->addr);
    const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
    auto output = reinterpret_cast<int *>(outputs[0]->addr);
    auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
    CPUKernelUtils::ParallelFor(task_0, elements_num);
    for (size_t index = 2; index < input_num_; ++index) {
      const auto input = reinterpret_cast<int *>(inputs[index]->addr);
      auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
      CPUKernelUtils::ParallelFor(task, elements_num);
    }
  } else {
    MS_LOG(EXCEPTION) << "AddN only support float32 and int32, but got " << TypeIdToType(dtype_)->ToString();
  }
  return true;
}
// Validate an AddN node: every input shape must equal the output shape, and
// the node must produce exactly one output. Throws MS_LOG(EXCEPTION) on any
// violation.
void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  if (src0_shape != dst_shape) {
    MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape.";
  }
  // Bug fix: CheckParam runs from InitKernel BEFORE input_num_ is assigned,
  // so the member was still 0 here and the loop below never executed —
  // the per-input shape consistency check was silently skipped. Query the
  // input count directly (caching it is harmless; InitKernel re-assigns the
  // same value afterwards).
  input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t index = 1; index < input_num_; ++index) {
    auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index);
    if (src0_shape != src_shape) {
      MS_LOG(EXCEPTION) << "AddN input shapes must be equal.";
    }
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output.";
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,51 +1,51 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
// CPU kernel that sums N equally-shaped inputs element-wise (AddN).
// The .cc implementation uses a oneDNN binary-add primitive for float32 and
// the nnacl integer add for int32.
class AddNCPUKernel : public MKLCPUKernel {
 public:
  AddNCPUKernel() = default;
  ~AddNCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Checks that all input shapes match the output shape; throws on mismatch.
  void CheckParam(const CNodePtr &kernel_node);
  size_t input_num_{0};  // number of tensors to accumulate
  std::vector<size_t> output_shape_;
  TypeId dtype_{kNumberTypeFloat32};  // input device dtype captured in InitKernel
};
// AddN is registered for float32 and int32 only; any other dtype throws in
// Launch().
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                  AddNCPUKernel);
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                  AddNCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
// CPU kernel that sums N equally-shaped inputs element-wise (AddN).
// The .cc implementation uses a oneDNN binary-add primitive for float32 and
// the nnacl integer add for int32.
class AddNCPUKernel : public MKLCPUKernel {
 public:
  AddNCPUKernel() = default;
  ~AddNCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Checks that all input shapes match the output shape; throws on mismatch.
  void CheckParam(const CNodePtr &kernel_node);
  size_t input_num_{0};  // number of tensors to accumulate
  std::vector<size_t> output_shape_;
  TypeId dtype_{kNumberTypeFloat32};  // input device dtype captured in InitKernel
};
// AddN is registered for float32 and int32 only; any other dtype throws in
// Launch().
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                  AddNCPUKernel);
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                  AddNCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_

View File

@ -1,178 +1,178 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h"
#include <string>
#include "utils/ms_utils.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
const int kMaxLSTMLayer = 100;
const int kOutputWorkSpaceIndex = 3;
// Resize output index 3 (the oneDNN workspace / reserve buffer) to the size
// the primitive requires, and write the adjusted shape back onto the node so
// downstream allocation matches. Must run after InitKernel set reserve_size_.
void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  CPUKernel::InitInputOutputSize(kernel_node);
  output_size_list_[kOutputWorkSpaceIndex] = reserve_size_;
  auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
  auto output_types = std::vector<TypeId>(output_num, output_type);
  std::vector<std::vector<size_t>> output_shapes;
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    std::vector<size_t> shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index);
    output_shapes.emplace_back(shape);
  }
  size_t len = reserve_size_ / 4;  // reserve bytes -> count of 4-byte elements
  output_shapes[kOutputWorkSpaceIndex] = {len, 1};
  AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get());
}
// Build the oneDNN LSTM forward primitive from the node's attributes.
// Dimensions come from CheckParam(); weight layouts are declared `tag::any`
// so oneDNN can choose its preferred packed format (Launch() reorders the
// user weights into it at run time).
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
#ifdef PLATFORM_86
  // Flush denormals to zero on x86 to avoid the FP-assist slow path.
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // oneDNN RNN dim conventions: t=seq, n=batch, c=channels, l=layers,
  // d=directions, g=gates (4 for LSTM), o=output channels.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // Default to training mode when the attribute is absent.
  if (!kernel_node->HasAttr(kAttrIsTraining)) {
    is_training = true;
  } else {
    is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining));
  }
  auto prop_kind = dnnl::prop_kind::forward_training;
  if (!is_training) {
    prop_kind = dnnl::prop_kind::forward_inference;
  }
  auto desc = std::make_shared<dnnl::lstm_forward::desc>(
    prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc);
  prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng);
  primitive_ = std::make_shared<dnnl::lstm_forward>(prim_desc_);
  if (is_training) {
    // Training needs the oneDNN workspace (reserve) for backprop; size it
    // from the primitive. Inference uses a 1-byte placeholder.
    reserve_size_ = static_cast<size_t>(prim_desc_.workspace_desc().get_size());
    AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc());
  } else {
    reserve_size_ = 1;
  }
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
}
// Read and validate LSTM attributes/shapes, and compute the flattened weight
// element counts (weight_size_, weight_h_size_) used by Launch() to index
// into the packed weights input. Throws on invalid configuration.
void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
  bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
  input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"));
  hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"));
  num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"));
  has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias");
  batch_size_ = SizeToInt(src_shape[1]);
  seq_len_ = SizeToInt(src_shape[0]);
  num_directions_ = 1;
  if (bidirectional_) {
    num_directions_ = 2;
  }
  const int gate_size = 4 * hidden_size_;  // LSTM has 4 gates per cell
  if (num_layers_ <= 0) {
    MS_LOG(EXCEPTION) << "Layers must be greater than zero!";
  }
  if (num_layers_ > kMaxLSTMLayer) {
    MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
  }
  // Layer 0 consumes input_size_ features; deeper layers consume the
  // (possibly bidirectional) hidden output of the previous layer.
  // NOTE(review): weight_size_/weight_h_size_ are members that accumulate
  // here — assumes CheckParam runs once per kernel instance; confirm if
  // re-initialization is possible.
  for (int i = 0; i < num_layers_; ++i) {
    weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
    weight_h_size_ += gate_size * hidden_size_;
  }
  weight_size_ = weight_size_ * num_directions_;
  weight_h_size_ = weight_h_size_ * num_directions_;
  if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) {
    MS_LOG(EXCEPTION) << "Error iteration shape!";
  }
  if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) {
    MS_LOG(EXCEPTION) << "Lstm only support 3-D input!";
  }
}
// Run LSTM forward. inputs[3] is a single flattened float buffer holding the
// layer weights, then the recurrent weights (offset weight_size_), then —
// when has_bias_ — the bias (offset weight_size_ + weight_h_size_).
// User weights are supplied in ldgoi layout and reordered into the
// primitive's preferred packed format before execution.
bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng);
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  // Reorder user ldgoi weights into the packed layout chosen by oneDNN.
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng);
  if (has_bias_) {
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No user bias: zero-fill the primitive's bias memory.
    if (memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0,
                 prim_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // set handle
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr);
  if (is_training) {
    // Training also exports the oneDNN workspace through output 3.
    SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr);
  }
  ExecutePrimitive();
  return true;
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h"
#include <string>
#include "utils/ms_utils.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
const int kMaxLSTMLayer = 100;
const int kOutputWorkSpaceIndex = 3;
// Resize output index 3 (the oneDNN workspace / reserve buffer) to the size
// the primitive requires, and write the adjusted shape back onto the node so
// downstream allocation matches. Must run after InitKernel set reserve_size_.
void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  CPUKernel::InitInputOutputSize(kernel_node);
  output_size_list_[kOutputWorkSpaceIndex] = reserve_size_;
  auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
  auto output_types = std::vector<TypeId>(output_num, output_type);
  std::vector<std::vector<size_t>> output_shapes;
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    std::vector<size_t> shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index);
    output_shapes.emplace_back(shape);
  }
  size_t len = reserve_size_ / 4;  // reserve bytes -> count of 4-byte elements
  output_shapes[kOutputWorkSpaceIndex] = {len, 1};
  AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get());
}
// Build the oneDNN LSTM forward primitive from the node's attributes.
// Dimensions come from CheckParam(); weight layouts are declared `tag::any`
// so oneDNN can choose its preferred packed format (Launch() reorders the
// user weights into it at run time).
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
#ifdef PLATFORM_86
  // Flush denormals to zero on x86 to avoid the FP-assist slow path.
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // oneDNN RNN dim conventions: t=seq, n=batch, c=channels, l=layers,
  // d=directions, g=gates (4 for LSTM), o=output channels.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // Default to training mode when the attribute is absent.
  if (!kernel_node->HasAttr(kAttrIsTraining)) {
    is_training = true;
  } else {
    is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining));
  }
  auto prop_kind = dnnl::prop_kind::forward_training;
  if (!is_training) {
    prop_kind = dnnl::prop_kind::forward_inference;
  }
  auto desc = std::make_shared<dnnl::lstm_forward::desc>(
    prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc);
  prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng);
  primitive_ = std::make_shared<dnnl::lstm_forward>(prim_desc_);
  if (is_training) {
    // Training needs the oneDNN workspace (reserve) for backprop; size it
    // from the primitive. Inference uses a 1-byte placeholder.
    reserve_size_ = static_cast<size_t>(prim_desc_.workspace_desc().get_size());
    AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc());
  } else {
    reserve_size_ = 1;
  }
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
}
// Read and validate LSTM attributes/shapes, and compute the flattened weight
// element counts (weight_size_, weight_h_size_) used by Launch() to index
// into the packed weights input. Throws on invalid configuration.
void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
  bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
  input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"));
  hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"));
  num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"));
  has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias");
  batch_size_ = SizeToInt(src_shape[1]);
  seq_len_ = SizeToInt(src_shape[0]);
  num_directions_ = 1;
  if (bidirectional_) {
    num_directions_ = 2;
  }
  const int gate_size = 4 * hidden_size_;  // LSTM has 4 gates per cell
  if (num_layers_ <= 0) {
    MS_LOG(EXCEPTION) << "Layers must be greater than zero!";
  }
  if (num_layers_ > kMaxLSTMLayer) {
    MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
  }
  // Layer 0 consumes input_size_ features; deeper layers consume the
  // (possibly bidirectional) hidden output of the previous layer.
  // NOTE(review): weight_size_/weight_h_size_ are members that accumulate
  // here — assumes CheckParam runs once per kernel instance; confirm if
  // re-initialization is possible.
  for (int i = 0; i < num_layers_; ++i) {
    weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
    weight_h_size_ += gate_size * hidden_size_;
  }
  weight_size_ = weight_size_ * num_directions_;
  weight_h_size_ = weight_h_size_ * num_directions_;
  if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) {
    MS_LOG(EXCEPTION) << "Error iteration shape!";
  }
  if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) {
    MS_LOG(EXCEPTION) << "Lstm only support 3-D input!";
  }
}
// Run LSTM forward. inputs[3] is a single flattened float buffer holding the
// layer weights, then the recurrent weights (offset weight_size_), then —
// when has_bias_ — the bias (offset weight_size_ + weight_h_size_).
// User weights are supplied in ldgoi layout and reordered into the
// primitive's preferred packed format before execution.
bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng);
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  // Reorder user ldgoi weights into the packed layout chosen by oneDNN.
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng);
  if (has_bias_) {
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No user bias: zero-fill the primitive's bias memory.
    if (memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0,
                 prim_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // set handle
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr);
  if (is_training) {
    // Training also exports the oneDNN workspace through output 3.
    SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr);
  }
  ExecutePrimitive();
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,76 +1,76 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64)
#define PLATFORM_86
#endif
#ifdef PLATFORM_86
#include <pmmintrin.h>
#endif
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the LSTM forward operator, implemented on top of the
// oneDNN (MKL-DNN) lstm_forward primitive via the MKLCPUKernel base class.
class LstmCPUKernel : public MKLCPUKernel {
 public:
  LstmCPUKernel() = default;
  ~LstmCPUKernel() override = default;
  // Parses node attributes/shapes and builds the DNNL forward primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Executes the LSTM forward pass with the buffers prepared by the runtime.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Validates attributes and input shapes; throws on inconsistency.
  void CheckParam(const CNodePtr &kernel_node);
  int weight_size_ = 0;    // total flat input-weight element count
  int weight_h_size_ = 0;  // total flat recurrent-weight element count
  int input_size_;
  int hidden_size_;
  int num_layers_;
  int batch_size_;
  int seq_len_;
  int num_directions_;  // 1 for unidirectional, 2 for bidirectional
  bool bidirectional_;
  bool has_bias_;
  size_t reserve_size_;  // byte size of the DNNL workspace (training only)
  bool is_training;
  dnnl::memory::dims weights_dims_;
  dnnl::memory::dims weights_h_dims_;
  dnnl::memory::dims bias_dims_;
  dnnl::lstm_forward::primitive_desc prim_desc_;
};
// Register the forward LSTM kernel: 4 inputs and 5 outputs, all float32.
MS_REG_CPU_KERNEL(LSTM,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LstmCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64)
#define PLATFORM_86
#endif
#ifdef PLATFORM_86
#include <pmmintrin.h>
#endif
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the LSTM forward operator, implemented on top of the
// oneDNN (MKL-DNN) lstm_forward primitive via the MKLCPUKernel base class.
class LstmCPUKernel : public MKLCPUKernel {
 public:
  LstmCPUKernel() = default;
  ~LstmCPUKernel() override = default;
  // Parses node attributes/shapes and builds the DNNL forward primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Executes the LSTM forward pass with the buffers prepared by the runtime.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Validates attributes and input shapes; throws on inconsistency.
  void CheckParam(const CNodePtr &kernel_node);
  int weight_size_ = 0;    // total flat input-weight element count
  int weight_h_size_ = 0;  // total flat recurrent-weight element count
  int input_size_;
  int hidden_size_;
  int num_layers_;
  int batch_size_;
  int seq_len_;
  int num_directions_;  // 1 for unidirectional, 2 for bidirectional
  bool bidirectional_;
  bool has_bias_;
  size_t reserve_size_;  // byte size of the DNNL workspace (training only)
  bool is_training;
  dnnl::memory::dims weights_dims_;
  dnnl::memory::dims weights_h_dims_;
  dnnl::memory::dims bias_dims_;
  dnnl::lstm_forward::primitive_desc prim_desc_;
};
// Register the forward LSTM kernel: 4 inputs and 5 outputs, all float32.
MS_REG_CPU_KERNEL(LSTM,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LstmCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_

View File

@ -1,218 +1,218 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h"
#include <cstring>
#include <string>
#include "utils/ms_utils.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
const int kMaxLSTMLayer = 100;
const int kInputWorkSpaceIndex = 10;
// Extends the base-class size computation: input slot kInputWorkSpaceIndex
// (the forward workspace buffer) is resized to the DNNL workspace byte size.
// NOTE(review): assumes InitKernel already computed reserve_size_ — confirm call order.
void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  CPUKernel::InitInputOutputSize(kernel_node);
  input_size_list_[kInputWorkSpaceIndex] = reserve_size_;
}
// Builds the oneDNN lstm_backward primitive for this node: validates
// attributes/shapes, assembles all tensor descriptors, and creates a matching
// forward primitive_desc (backward hint) whose workspace size is recorded.
void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // Logical dims: src = (T, N, C); iter states = (L, D, N, H);
  // weights carry an extra "4" axis for the four LSTM gates.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // Weights use tag::any so DNNL may pick its preferred blocked layout;
  // Launch reorders the user's ldgoi buffers into that layout.
  auto forward_desc = std::make_shared<dnnl::lstm_forward::desc>(
    dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
    formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc,
    dst_c_desc);
  auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng);
  auto backward_desc = std::make_shared<dnnl::lstm_backward::desc>(
    dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
    src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc,
    dst_h_desc, dst_c_desc);
  // The backward primitive_desc needs the forward primitive_desc as a hint.
  prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc);
  primitive_ = std::make_shared<dnnl::lstm_backward>(prim_backward_desc_);
  // Workspace produced by forward training; consumed here as input #10.
  reserve_size_ = static_cast<size_t>(prim_forward_desc.workspace_desc().get_size());
  AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc());
  AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
}
// Registers every DNNL execution argument (descriptor only; handles are bound
// later in SetArgumentHandleOp). Weight descriptors come from the backward
// primitive_desc because their layout was chosen by DNNL (tag::any).
void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                                      const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                                      const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                                      const dnnl::memory::desc &dst_c_desc) {
  // Forward-pass tensors.
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
  // Gradient tensors (same shapes as their forward counterparts).
  AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc());
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc());
  AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc);
}
void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
input_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size");
hidden_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size");
num_layers_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers");
has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias");
batch_size_ = SizeToInt(src_shape[1]);
seq_len_ = SizeToInt(src_shape[0]);
num_directions_ = 1;
if (bidirectional_) {
num_directions_ = 2;
}
const int64_t gate_size = 4 * hidden_size_;
if (num_layers_ <= 0) {
MS_LOG(EXCEPTION) << "Layers must be greater than zero!";
}
if (num_layers_ > kMaxLSTMLayer) {
MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
}
for (int64_t i = 0; i < num_layers_; ++i) {
weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
weight_h_size_ += gate_size * hidden_size_;
}
weight_size_ = weight_size_ * num_directions_;
weight_h_size_ = weight_h_size_ * num_directions_;
if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) {
MS_LOG(EXCEPTION) << "Error iteration shape!";
}
if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) {
MS_LOG(EXCEPTION) << "Lstm only support 3-D input!";
}
}
// Binds concrete buffer addresses to the DNNL arguments registered in
// AddArgumentOp. Inputs: 0=x, 1=h0, 2=c0, 4=y, 5=hy, 6=cy, 7=dy, 8=dhy,
// 9=dcy, kInputWorkSpaceIndex=forward workspace; outputs: 0=dx, 1=dh0, 2=dc0.
void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                                            const std::vector<kernel::AddressPtr> &outputs,
                                            const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory,
                                            const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory,
                                            const dnnl::memory &diff_weights_h_memory,
                                            const dnnl::memory &diff_bias_memory) {
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr);
  // Use the named constant instead of the magic number 10 so this stays in
  // sync with InitInputOutputSize, which sizes the same slot.
  SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[kInputWorkSpaceIndex]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr);
}
// Zero-fills the given DNNL memory buffer; throws on memset_s failure,
// using `name` to identify the buffer in the error message.
void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const {
  const size_t bytes = mem.get_desc().get_size();
  if (memset_s(mem.get_data_handle(), bytes, 0, bytes) != 0) {
    MS_LOG(EXCEPTION) << name << " memset error";
  }
}
// Executes the LSTM backward pass: reorders the user's packed weights into
// DNNL's blocked layout, runs the primitive, then reorders weight gradients
// back into the user's packed output buffer (outputs[3] = [dw | dwh | dbias]).
bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                               const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // construct fw memory
  // inputs[3] is the flat weight blob: [weights | weights_h | bias] in ldgoi order.
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng);
  auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng);
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  // Convert user layout -> DNNL's preferred blocked layout.
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  if (has_bias_) {
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias provided: feed DNNL a zero-filled bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0,
                 prim_backward_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // construct bw memory
  auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng);
  auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng);
  auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng);
  auto user_diff_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  user_diff_weights_memory.set_data_handle(outputs[3]->addr);
  user_diff_weights_h_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_);
  // Zero all gradient buffers before the primitive accumulates into them.
  ResetMemory(user_diff_weights_memory, "user weights grad");
  ResetMemory(user_diff_weights_h_memory, "user weights iter grad");
  ResetMemory(diff_weights_memory, "weights grad");
  ResetMemory(diff_weights_h_memory, "weights iter grad");
  if (has_bias_) {
    // Bias gradient is written directly into the tail of the packed output.
    diff_bias_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_ + weight_h_size_);
  }
  if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0,
               prim_backward_desc_.diff_bias_desc().get_size())) {
    MS_LOG(EXCEPTION) << "Bias grad memset error";
  }
  SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory,
                      diff_weights_h_memory, diff_bias_memory);
  ExecutePrimitive();
  // Convert weight gradients back to the user's packed ldgoi layout.
  Reorder(&diff_weights_memory, &user_diff_weights_memory);
  Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory);
  return true;
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h"
#include <cstring>
#include <string>
#include "utils/ms_utils.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
const int kMaxLSTMLayer = 100;
const int kInputWorkSpaceIndex = 10;
// Extends the base-class size computation: input slot kInputWorkSpaceIndex
// (the forward workspace buffer) is resized to the DNNL workspace byte size.
// NOTE(review): assumes InitKernel already computed reserve_size_ — confirm call order.
void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  CPUKernel::InitInputOutputSize(kernel_node);
  input_size_list_[kInputWorkSpaceIndex] = reserve_size_;
}
// Builds the oneDNN lstm_backward primitive for this node: validates
// attributes/shapes, assembles all tensor descriptors, and creates a matching
// forward primitive_desc (backward hint) whose workspace size is recorded.
void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // Logical dims: src = (T, N, C); iter states = (L, D, N, H);
  // weights carry an extra "4" axis for the four LSTM gates.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // Weights use tag::any so DNNL may pick its preferred blocked layout;
  // Launch reorders the user's ldgoi buffers into that layout.
  auto forward_desc = std::make_shared<dnnl::lstm_forward::desc>(
    dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
    formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc,
    dst_c_desc);
  auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng);
  auto backward_desc = std::make_shared<dnnl::lstm_backward::desc>(
    dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
    src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc,
    dst_h_desc, dst_c_desc);
  // The backward primitive_desc needs the forward primitive_desc as a hint.
  prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc);
  primitive_ = std::make_shared<dnnl::lstm_backward>(prim_backward_desc_);
  // Workspace produced by forward training; consumed here as input #10.
  reserve_size_ = static_cast<size_t>(prim_forward_desc.workspace_desc().get_size());
  AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc());
  AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
}
// Registers every DNNL execution argument (descriptor only; handles are bound
// later in SetArgumentHandleOp). Weight descriptors come from the backward
// primitive_desc because their layout was chosen by DNNL (tag::any).
void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                                      const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                                      const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                                      const dnnl::memory::desc &dst_c_desc) {
  // Forward-pass tensors.
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
  // Gradient tensors (same shapes as their forward counterparts).
  AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc());
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc());
  AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc);
}
void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
input_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size");
hidden_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size");
num_layers_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers");
has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias");
batch_size_ = SizeToInt(src_shape[1]);
seq_len_ = SizeToInt(src_shape[0]);
num_directions_ = 1;
if (bidirectional_) {
num_directions_ = 2;
}
const int64_t gate_size = 4 * hidden_size_;
if (num_layers_ <= 0) {
MS_LOG(EXCEPTION) << "Layers must be greater than zero!";
}
if (num_layers_ > kMaxLSTMLayer) {
MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
}
for (int64_t i = 0; i < num_layers_; ++i) {
weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
weight_h_size_ += gate_size * hidden_size_;
}
weight_size_ = weight_size_ * num_directions_;
weight_h_size_ = weight_h_size_ * num_directions_;
if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) {
MS_LOG(EXCEPTION) << "Error iteration shape!";
}
if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) {
MS_LOG(EXCEPTION) << "Lstm only support 3-D input!";
}
}
// Binds concrete buffer addresses to the DNNL arguments registered in
// AddArgumentOp. Inputs: 0=x, 1=h0, 2=c0, 4=y, 5=hy, 6=cy, 7=dy, 8=dhy,
// 9=dcy, kInputWorkSpaceIndex=forward workspace; outputs: 0=dx, 1=dh0, 2=dc0.
void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                                            const std::vector<kernel::AddressPtr> &outputs,
                                            const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory,
                                            const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory,
                                            const dnnl::memory &diff_weights_h_memory,
                                            const dnnl::memory &diff_bias_memory) {
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr);
  // Use the named constant instead of the magic number 10 so this stays in
  // sync with InitInputOutputSize, which sizes the same slot.
  SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[kInputWorkSpaceIndex]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr);
  SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr);
}
// Zero-fills the given DNNL memory buffer; throws on memset_s failure,
// using `name` to identify the buffer in the error message.
void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const {
  const size_t bytes = mem.get_desc().get_size();
  if (memset_s(mem.get_data_handle(), bytes, 0, bytes) != 0) {
    MS_LOG(EXCEPTION) << name << " memset error";
  }
}
// Executes the LSTM backward pass: reorders the user's packed weights into
// DNNL's blocked layout, runs the primitive, then reorders weight gradients
// back into the user's packed output buffer (outputs[3] = [dw | dwh | dbias]).
bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                               const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // construct fw memory
  // inputs[3] is the flat weight blob: [weights | weights_h | bias] in ldgoi order.
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng);
  auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng);
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  // Convert user layout -> DNNL's preferred blocked layout.
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  if (has_bias_) {
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias provided: feed DNNL a zero-filled bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0,
                 prim_backward_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // construct bw memory
  auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng);
  auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng);
  auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng);
  auto user_diff_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  user_diff_weights_memory.set_data_handle(outputs[3]->addr);
  user_diff_weights_h_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_);
  // Zero all gradient buffers before the primitive accumulates into them.
  ResetMemory(user_diff_weights_memory, "user weights grad");
  ResetMemory(user_diff_weights_h_memory, "user weights iter grad");
  ResetMemory(diff_weights_memory, "weights grad");
  ResetMemory(diff_weights_h_memory, "weights iter grad");
  if (has_bias_) {
    // Bias gradient is written directly into the tail of the packed output.
    diff_bias_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_ + weight_h_size_);
  }
  if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0,
               prim_backward_desc_.diff_bias_desc().get_size())) {
    MS_LOG(EXCEPTION) << "Bias grad memset error";
  }
  SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory,
                      diff_weights_h_memory, diff_bias_memory);
  ExecutePrimitive();
  // Convert weight gradients back to the user's packed ldgoi layout.
  Reorder(&diff_weights_memory, &user_diff_weights_memory);
  Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory);
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,87 +1,87 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
#include <string>
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the LSTM backward (gradient) operator, implemented with the
// oneDNN (MKL-DNN) lstm_backward primitive via the MKLCPUKernel base class.
class LSTMGradCPUKernel : public MKLCPUKernel {
 public:
  LSTMGradCPUKernel() = default;
  ~LSTMGradCPUKernel() override = default;
  // Parses node attributes/shapes and builds the DNNL backward primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Runs the backward pass; produces gradients w.r.t. x, h0, c0 and weights.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Registers all DNNL argument descriptors for the backward primitive.
  void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                     const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                     const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                     const dnnl::memory::desc &dst_c_desc);
  // Binds concrete buffer addresses to the registered DNNL arguments.
  void SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                           const std::vector<kernel::AddressPtr> &outputs, const dnnl::memory &weights_memory,
                           const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory,
                           const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory,
                           const dnnl::memory &diff_bias_memory);
  // Zero-fills a DNNL memory buffer; `name` labels the error message.
  // NOTE(review): `name` is taken by value; const std::string & would avoid a
  // copy (requires a matching change in the .cc definition).
  void ResetMemory(const dnnl::memory &mem, const string name) const;
  // Validates attributes and input shapes; throws on inconsistency.
  void CheckParam(const CNodePtr &kernel_node);
  int64_t weight_size_ = 0;    // total flat input-weight element count
  int64_t weight_h_size_ = 0;  // total flat recurrent-weight element count
  int64_t input_size_;
  int64_t hidden_size_;
  int64_t num_layers_;
  int64_t batch_size_;
  int64_t seq_len_;
  int num_directions_;  // 1 for unidirectional, 2 for bidirectional
  bool bidirectional_;
  bool has_bias_;
  size_t reserve_size_;  // byte size of the forward DNNL workspace
  dnnl::memory::dims weights_dims_;
  dnnl::memory::dims weights_h_dims_;
  dnnl::memory::dims bias_dims_;
  dnnl::lstm_backward::primitive_desc prim_backward_desc_;
};
// Register the LSTM gradient kernel: 11 inputs and 4 outputs, all float32.
MS_REG_CPU_KERNEL(LSTMGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LSTMGradCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
#include <string>
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the LSTM backward (gradient) operator, implemented with the
// oneDNN (MKL-DNN) lstm_backward primitive via the MKLCPUKernel base class.
class LSTMGradCPUKernel : public MKLCPUKernel {
 public:
  LSTMGradCPUKernel() = default;
  ~LSTMGradCPUKernel() override = default;
  // Parses node attributes/shapes and builds the DNNL backward primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Runs the backward pass; produces gradients w.r.t. x, h0, c0 and weights.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Registers all DNNL argument descriptors for the backward primitive.
  void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                     const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                     const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                     const dnnl::memory::desc &dst_c_desc);
  // Binds concrete buffer addresses to the registered DNNL arguments.
  void SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                           const std::vector<kernel::AddressPtr> &outputs, const dnnl::memory &weights_memory,
                           const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory,
                           const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory,
                           const dnnl::memory &diff_bias_memory);
  // Zero-fills a DNNL memory buffer; `name` labels the error message.
  // NOTE(review): `name` is taken by value; const std::string & would avoid a
  // copy (requires a matching change in the .cc definition).
  void ResetMemory(const dnnl::memory &mem, const string name) const;
  // Validates attributes and input shapes; throws on inconsistency.
  void CheckParam(const CNodePtr &kernel_node);
  int64_t weight_size_ = 0;    // total flat input-weight element count
  int64_t weight_h_size_ = 0;  // total flat recurrent-weight element count
  int64_t input_size_;
  int64_t hidden_size_;
  int64_t num_layers_;
  int64_t batch_size_;
  int64_t seq_len_;
  int num_directions_;  // 1 for unidirectional, 2 for bidirectional
  bool bidirectional_;
  bool has_bias_;
  size_t reserve_size_;  // byte size of the forward DNNL workspace
  dnnl::memory::dims weights_dims_;
  dnnl::memory::dims weights_h_dims_;
  dnnl::memory::dims bias_dims_;
  dnnl::lstm_backward::primitive_desc prim_backward_desc_;
};
// Register the LSTMGrad CPU kernel: eleven float32 inputs and four float32
// gradient outputs, all handled by LSTMGradCPUKernel.
MS_REG_CPU_KERNEL(LSTMGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LSTMGradCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_

View File

@ -1,99 +1,99 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h"
#include <numeric>
#include <functional>
#include <cmath>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
namespace mindspore {
namespace kernel {
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
CPUKernel::InitInputOutputSize(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
size_t type_size = sizeof(float);
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
workspace_size_list_.emplace_back(tensor_size);
}
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
dnnl::memory::dims mem_dims;
mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
if (mem_dims.size() != 2) {
MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
}
batch_size_ = shape[0];
class_num_ = shape[1];
if (batch_size_ == 0 || class_num_ == 0) {
MS_LOG(EXCEPTION) << "Invalid batch size or class num input!";
}
dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc);
dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1);
auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc);
AddArgument(DNNL_ARG_SRC, mem_desc);
AddArgument(DNNL_ARG_DST, mem_desc);
}
// Computes the per-sample cross-entropy loss (output1) and the gradient
// w.r.t. the logits (output2):
//   output1[i]    = -sum_j labels[i,j] * log(max(p[i,j], epsilon))
//   output2[i,j]  = p[i,j] - labels[i,j]
// where p are the softmax probabilities produced by the dnnl forward pass
// (see Launch). p is clamped to epsilon before the log to avoid -inf.
void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels,
                                                                float *output1, float *output2) const {
  // float literal avoids the silent double->float narrowing of `1e-6`.
  constexpr float epsilon = 1e-6f;
  for (size_t i = 0; i < batch_size_; ++i) {
    float loss = 0.0f;
    for (size_t j = 0; j < class_num_; ++j) {
      const size_t idx = i * class_num_ + j;
      const float prob = logits[idx];
      const float logit = logf(prob <= 0.0f ? epsilon : prob);
      output2[idx] = prob - labels[idx];
      loss += labels[idx] * logit;
    }
    // Removed the redundant `output1[i] = 0;` pre-initialization: the value
    // was unconditionally overwritten here.
    output1[i] = -loss;
  }
}
// Runs the dnnl softmax on the logits into the workspace buffer, then derives
// the per-sample loss and the logits gradient with ForwardPostExecute.
// All buffer sizes are validated against the cached (batch, class) shape.
bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                                    const std::vector<kernel::AddressPtr> &workspace,
                                                    const std::vector<kernel::AddressPtr> &outputs) {
  if (inputs.empty() || workspace.empty() || outputs.empty()) {
    MS_LOG(EXCEPTION) << "Error input output size!";
  }
  const size_t loss_bytes = batch_size_ * sizeof(float);
  const size_t logits_bytes = class_num_ * loss_bytes;
  const bool inputs_ok = inputs[0]->size == workspace[0]->size && inputs[0]->size == logits_bytes &&
                         inputs[1]->size == logits_bytes;
  if (!inputs_ok) {
    MS_LOG(EXCEPTION) << "Error input data size!";
  }
  if (outputs[1]->size != logits_bytes || outputs[0]->size != loss_bytes) {
    MS_LOG(EXCEPTION) << "Error output data size!";
  }
  SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr);
  ExecutePrimitive();
  auto *label_data = reinterpret_cast<float *>(inputs[1]->addr);
  auto *softmax_out = reinterpret_cast<float *>(workspace[0]->addr);
  auto *loss_out = reinterpret_cast<float *>(outputs[0]->addr);
  auto *grad_out = reinterpret_cast<float *>(outputs[1]->addr);
  ForwardPostExecute(softmax_out, label_data, loss_out, grad_out);
  return true;
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h"
#include <numeric>
#include <functional>
#include <cmath>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
namespace mindspore {
namespace kernel {
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
CPUKernel::InitInputOutputSize(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
size_t type_size = sizeof(float);
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
workspace_size_list_.emplace_back(tensor_size);
}
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
dnnl::memory::dims mem_dims;
mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
if (mem_dims.size() != 2) {
MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
}
batch_size_ = shape[0];
class_num_ = shape[1];
if (batch_size_ == 0 || class_num_ == 0) {
MS_LOG(EXCEPTION) << "Invalid batch size or class num input!";
}
dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc);
dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1);
auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc);
AddArgument(DNNL_ARG_SRC, mem_desc);
AddArgument(DNNL_ARG_DST, mem_desc);
}
// Computes the per-sample cross-entropy loss (output1) and the gradient
// w.r.t. the logits (output2):
//   output1[i]    = -sum_j labels[i,j] * log(max(p[i,j], epsilon))
//   output2[i,j]  = p[i,j] - labels[i,j]
// where p are the softmax probabilities produced by the dnnl forward pass
// (see Launch). p is clamped to epsilon before the log to avoid -inf.
void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels,
                                                                float *output1, float *output2) const {
  // float literal avoids the silent double->float narrowing of `1e-6`.
  constexpr float epsilon = 1e-6f;
  for (size_t i = 0; i < batch_size_; ++i) {
    float loss = 0.0f;
    for (size_t j = 0; j < class_num_; ++j) {
      const size_t idx = i * class_num_ + j;
      const float prob = logits[idx];
      const float logit = logf(prob <= 0.0f ? epsilon : prob);
      output2[idx] = prob - labels[idx];
      loss += labels[idx] * logit;
    }
    // Removed the redundant `output1[i] = 0;` pre-initialization: the value
    // was unconditionally overwritten here.
    output1[i] = -loss;
  }
}
// Runs the dnnl softmax on the logits into the workspace buffer, then derives
// the per-sample loss and the logits gradient with ForwardPostExecute.
// All buffer sizes are validated against the cached (batch, class) shape.
bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                                    const std::vector<kernel::AddressPtr> &workspace,
                                                    const std::vector<kernel::AddressPtr> &outputs) {
  if (inputs.empty() || workspace.empty() || outputs.empty()) {
    MS_LOG(EXCEPTION) << "Error input output size!";
  }
  const size_t loss_bytes = batch_size_ * sizeof(float);
  const size_t logits_bytes = class_num_ * loss_bytes;
  const bool inputs_ok = inputs[0]->size == workspace[0]->size && inputs[0]->size == logits_bytes &&
                         inputs[1]->size == logits_bytes;
  if (!inputs_ok) {
    MS_LOG(EXCEPTION) << "Error input data size!";
  }
  if (outputs[1]->size != logits_bytes || outputs[0]->size != loss_bytes) {
    MS_LOG(EXCEPTION) << "Error output data size!";
  }
  SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr);
  ExecutePrimitive();
  auto *label_data = reinterpret_cast<float *>(inputs[1]->addr);
  auto *softmax_out = reinterpret_cast<float *>(workspace[0]->addr);
  auto *loss_out = reinterpret_cast<float *>(outputs[0]->addr);
  auto *grad_out = reinterpret_cast<float *>(outputs[1]->addr);
  ForwardPostExecute(softmax_out, label_data, loss_out, grad_out);
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,53 +1,53 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
// CPU kernel for SoftmaxCrossEntropyWithLogits: runs a dnnl softmax over the
// logits, then computes the per-sample loss and the gradient w.r.t. logits.
class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
 public:
  SoftmaxCrossEntropyWithLogitsCPUKernel() = default;
  ~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
  // Caches (batch, class) sizes and builds the dnnl softmax primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  // Adds one float workspace buffer sized like input 0 for the softmax output.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Derives loss (output1) and logits gradient (output2) from the softmax
  // probabilities and the labels.
  void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const;
  size_t class_num_{0};
  size_t batch_size_{0};
};
// Register the kernel: two float32 inputs and two float32 outputs
// (per-sample loss, logits gradient).
MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  SoftmaxCrossEntropyWithLogitsCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
// CPU kernel for SoftmaxCrossEntropyWithLogits: runs a dnnl softmax over the
// logits, then computes the per-sample loss and the gradient w.r.t. logits.
class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
 public:
  SoftmaxCrossEntropyWithLogitsCPUKernel() = default;
  ~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
  // Caches (batch, class) sizes and builds the dnnl softmax primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  // Adds one float workspace buffer sized like input 0 for the softmax output.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Derives loss (output1) and logits gradient (output2) from the softmax
  // probabilities and the labels.
  void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const;
  size_t class_num_{0};
  size_t batch_size_{0};
};
// Register the kernel: two float32 inputs and two float32 outputs
// (per-sample loss, logits gradient).
MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  SoftmaxCrossEntropyWithLogitsCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_

View File

@ -1,59 +1,59 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/kernel.h"
#include "ps/util.h"
namespace mindspore {
namespace kernel {
namespace ps {
using mindspore::ps::Util;
// Abstract base for parameter-server-side kernels. Concrete kernels implement
// Execute and report their input/output/workspace sizes; rank_id_ identifies
// this server among pserver_num_ servers serving worker_num_ workers.
class PServerKernel {
 public:
  PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num)
      : rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {}
  // Virtual: this class is used polymorphically (pure-virtual Execute), so a
  // derived kernel deleted through a PServerKernel pointer must run its own
  // destructor. The previous non-virtual destructor made such a delete UB.
  virtual ~PServerKernel() = default;
  PServerKernel(const PServerKernel &) = delete;
  PServerKernel &operator=(const PServerKernel &) = delete;
  virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
  virtual void InitKernel(const CNodePtr &cnode,
                          const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
  virtual void ReInit(const std::vector<std::vector<size_t>> &) {}
  // Runs the kernel once over the given buffers; must be implemented.
  virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                       const std::vector<AddressPtr> &outputs) = 0;
  virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals,
                                size_t ids_size) {}
  virtual const std::vector<size_t> &input_sizes() const = 0;
  virtual const std::vector<size_t> &output_sizes() const = 0;
  virtual const std::vector<size_t> &workspace_sizes() const = 0;
 protected:
  virtual void ReInit(const std::vector<AddressPtr> &) {}
  // Adjusts `shape` along `axis` for this server's shard (defined in .cc;
  // presumably divides by pserver_num_ — confirm against implementation).
  void Shard(std::vector<size_t> *shape, int axis);
  size_t rank_id_;
  size_t pserver_num_;
  size_t worker_num_;
};
} // namespace ps
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/kernel.h"
#include "ps/util.h"
namespace mindspore {
namespace kernel {
namespace ps {
using mindspore::ps::Util;
// Abstract base for parameter-server-side kernels. Concrete kernels implement
// Execute and report their input/output/workspace sizes; rank_id_ identifies
// this server among pserver_num_ servers serving worker_num_ workers.
class PServerKernel {
 public:
  PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num)
      : rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {}
  // Virtual: this class is used polymorphically (pure-virtual Execute), so a
  // derived kernel deleted through a PServerKernel pointer must run its own
  // destructor. The previous non-virtual destructor made such a delete UB.
  virtual ~PServerKernel() = default;
  PServerKernel(const PServerKernel &) = delete;
  PServerKernel &operator=(const PServerKernel &) = delete;
  virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
  virtual void InitKernel(const CNodePtr &cnode,
                          const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
  virtual void ReInit(const std::vector<std::vector<size_t>> &) {}
  // Runs the kernel once over the given buffers; must be implemented.
  virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                       const std::vector<AddressPtr> &outputs) = 0;
  virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals,
                                size_t ids_size) {}
  virtual const std::vector<size_t> &input_sizes() const = 0;
  virtual const std::vector<size_t> &output_sizes() const = 0;
  virtual const std::vector<size_t> &workspace_sizes() const = 0;
 protected:
  virtual void ReInit(const std::vector<AddressPtr> &) {}
  // Adjusts `shape` along `axis` for this server's shard (defined in .cc;
  // presumably divides by pserver_num_ — confirm against implementation).
  void Shard(std::vector<size_t> *shape, int axis);
  size_t rank_id_;
  size_t pserver_num_;
  size_t worker_num_;
};
} // namespace ps
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_

View File

@ -1,138 +1,138 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h"
#include <string>
#include <vector>
#include <algorithm>
#include <utility>
namespace mindspore {
namespace kernel {
// Initializes the reduction: caches the input shape, normalizes the AXIS
// attribute (negative axes, ordering, duplicates) and selects the reduction
// functor from the kernel name.
template <typename T>
void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS);
  // Guard against a node that carries no AXIS attribute at all.
  MS_EXCEPTION_IF_NULL(axis_addr);
  if (axis_addr->isa<ValueTuple>() || axis_addr->isa<ValueList>()) {
    axis_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS);
  } else if (axis_addr->isa<Int64Imm>()) {
    axis_.emplace_back(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS));
  } else {
    MS_LOG(EXCEPTION) << "Attribute is invalid";
  }
  // Map negative axes to their positive equivalents, then sort.
  int dimension = input_shape_.size();
  std::transform(axis_.begin(), axis_.end(), axis_.begin(),
                 [dimension](const auto &a) { return a < 0 ? dimension + a : a; });
  sort(axis_.begin(), axis_.end());
  // Delete the duplicate axis.
  auto last = std::unique(axis_.begin(), axis_.end());
  axis_.erase(last, axis_.end());
  auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  if constexpr (std::is_same<T, bool>::value) {
    if (kernel_name == "ReduceAll") {
      reduce_type_ = kReduceAll;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; };
    } else if (kernel_name == "ReduceAny") {
      reduce_type_ = kReduceAny;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; };
    } else {
      // Report kernel_name, consistently with the non-bool branch below
      // (this branch previously streamed fullname_, which is not a member
      // declared in this kernel's header).
      MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name << " for bool.";
    }
  } else {
    if (kernel_name == "ReduceMax") {
      reduce_type_ = kReduceMax;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); };
    } else if (kernel_name == "ReduceMin") {
      reduce_type_ = kReduceMin;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); };
    } else if (kernel_name == "ReduceSum") {
      reduce_type_ = kReduceSum;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
    } else if (kernel_name == "ReduceMean") {
      // Mean accumulates like Sum; Launch divides by the element count.
      reduce_type_ = kReduceMean;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
    } else {
      MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name;
    }
  }
}
// Performs the reduction selected in InitKernel.
//
// Fast path: when no axes are given or the input is effectively 1-D, every
// element folds into a single scalar output.
//
// General path: conceptually transposes the input so that all non-reduced
// axes come first and all reduced axes come last. Each output element then
// corresponds to a contiguous run of `stride` positions in that transposed
// order (stride = product of the reduced dimensions), which is folded with
// reduce_func_. The transposition is never materialized — TransposeIterator
// walks the original buffer in transposed order. Runs in parallel over
// output elements.
template <typename T>
bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                                const std::vector<kernel::AddressPtr> &outputs) {
  size_t input_size = inputs[0]->size / sizeof(T);
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
    // Get one ret
    // Seed with the first element, then fold in the rest (starting the loop
    // at 1 avoids double-counting element 0).
    *output_addr = input_addr[0];
    for (size_t i = 1; i < input_size; ++i) {
      reduce_func_(input_addr, i, output_addr);
    }
    if (reduce_type_ == kReduceMean) {
      *output_addr /= input_size;
    }
  } else {
    // Calculate transpose axes and stride
    // axes = [non-reduced dims in order] + [reduced dims in order];
    // stride = number of input elements contributing to one output element.
    int dimension = input_shape_.size();
    size_t stride = 1;
    std::vector<size_t> axes(input_shape_.size());
    size_t j = 0;
    size_t k = 0;
    for (int i = 0; i < dimension; ++i) {
      if (j == axis_.size() || i != axis_[j]) {
        axes[k] = i;
        ++k;
      } else {
        stride *= input_shape_[i];
        ++j;
      }
    }
    for (auto &it : axis_) {
      axes[k] = it;
      ++k;
    }
    // Calculate transpose shape
    std::vector<size_t> transpose_shape(input_shape_.size());
    for (int i = 0; i < dimension; ++i) {
      transpose_shape[i] = input_shape_[axes[i]];
    }
    size_t output_size = outputs[0]->size / sizeof(T);
    TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
    // Each worker copies the iterator and seeks to its own starting position;
    // output element i reduces transposed positions [i*stride, (i+1)*stride).
    auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
      auto iter = base_iter;
      iter.SetPos(start * stride);
      for (size_t i = start; i < end; ++i) {
        // Seed with the run's first element, then fold the remaining stride-1.
        output_addr[i] = input_addr[iter.GetPos()];
        iter.GenNextPos();
        for (size_t j = 1; j < stride; ++j) {
          reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
          iter.GenNextPos();
        }
        if (reduce_type_ == kReduceMean) {
          output_addr[i] /= stride;
        }
      }
    };
    CPUKernelUtils::ParallelFor(task, output_size);
  }
  return true;
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h"
#include <string>
#include <vector>
#include <algorithm>
#include <utility>
namespace mindspore {
namespace kernel {
// Initializes the reduction: caches the input shape, normalizes the AXIS
// attribute (negative axes, ordering, duplicates) and selects the reduction
// functor from the kernel name.
template <typename T>
void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS);
  // Guard against a node that carries no AXIS attribute at all.
  MS_EXCEPTION_IF_NULL(axis_addr);
  if (axis_addr->isa<ValueTuple>() || axis_addr->isa<ValueList>()) {
    axis_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS);
  } else if (axis_addr->isa<Int64Imm>()) {
    axis_.emplace_back(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS));
  } else {
    MS_LOG(EXCEPTION) << "Attribute is invalid";
  }
  // Map negative axes to their positive equivalents, then sort.
  int dimension = input_shape_.size();
  std::transform(axis_.begin(), axis_.end(), axis_.begin(),
                 [dimension](const auto &a) { return a < 0 ? dimension + a : a; });
  sort(axis_.begin(), axis_.end());
  // Delete the duplicate axis.
  auto last = std::unique(axis_.begin(), axis_.end());
  axis_.erase(last, axis_.end());
  auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  if constexpr (std::is_same<T, bool>::value) {
    if (kernel_name == "ReduceAll") {
      reduce_type_ = kReduceAll;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; };
    } else if (kernel_name == "ReduceAny") {
      reduce_type_ = kReduceAny;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; };
    } else {
      // Report kernel_name, consistently with the non-bool branch below
      // (this branch previously streamed fullname_, which is not a member
      // declared in this kernel's header).
      MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name << " for bool.";
    }
  } else {
    if (kernel_name == "ReduceMax") {
      reduce_type_ = kReduceMax;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); };
    } else if (kernel_name == "ReduceMin") {
      reduce_type_ = kReduceMin;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); };
    } else if (kernel_name == "ReduceSum") {
      reduce_type_ = kReduceSum;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
    } else if (kernel_name == "ReduceMean") {
      // Mean accumulates like Sum; Launch divides by the element count.
      reduce_type_ = kReduceMean;
      reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
    } else {
      MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name;
    }
  }
}
// Performs the reduction selected in InitKernel.
//
// Fast path: when no axes are given or the input is effectively 1-D, every
// element folds into a single scalar output.
//
// General path: conceptually transposes the input so that all non-reduced
// axes come first and all reduced axes come last. Each output element then
// corresponds to a contiguous run of `stride` positions in that transposed
// order (stride = product of the reduced dimensions), which is folded with
// reduce_func_. The transposition is never materialized — TransposeIterator
// walks the original buffer in transposed order. Runs in parallel over
// output elements.
template <typename T>
bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                                const std::vector<kernel::AddressPtr> &outputs) {
  size_t input_size = inputs[0]->size / sizeof(T);
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
    // Get one ret
    // Seed with the first element, then fold in the rest (starting the loop
    // at 1 avoids double-counting element 0).
    *output_addr = input_addr[0];
    for (size_t i = 1; i < input_size; ++i) {
      reduce_func_(input_addr, i, output_addr);
    }
    if (reduce_type_ == kReduceMean) {
      *output_addr /= input_size;
    }
  } else {
    // Calculate transpose axes and stride
    // axes = [non-reduced dims in order] + [reduced dims in order];
    // stride = number of input elements contributing to one output element.
    int dimension = input_shape_.size();
    size_t stride = 1;
    std::vector<size_t> axes(input_shape_.size());
    size_t j = 0;
    size_t k = 0;
    for (int i = 0; i < dimension; ++i) {
      if (j == axis_.size() || i != axis_[j]) {
        axes[k] = i;
        ++k;
      } else {
        stride *= input_shape_[i];
        ++j;
      }
    }
    for (auto &it : axis_) {
      axes[k] = it;
      ++k;
    }
    // Calculate transpose shape
    std::vector<size_t> transpose_shape(input_shape_.size());
    for (int i = 0; i < dimension; ++i) {
      transpose_shape[i] = input_shape_[axes[i]];
    }
    size_t output_size = outputs[0]->size / sizeof(T);
    TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
    // Each worker copies the iterator and seeks to its own starting position;
    // output element i reduces transposed positions [i*stride, (i+1)*stride).
    auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
      auto iter = base_iter;
      iter.SetPos(start * stride);
      for (size_t i = start; i < end; ++i) {
        // Seed with the run's first element, then fold the remaining stride-1.
        output_addr[i] = input_addr[iter.GetPos()];
        iter.GenNextPos();
        for (size_t j = 1; j < stride; ++j) {
          reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
          iter.GenNextPos();
        }
        if (reduce_type_ == kReduceMean) {
          output_addr[i] /= stride;
        }
      }
    };
    CPUKernelUtils::ParallelFor(task, output_size);
  }
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,69 +1,69 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include <string>
#include <functional>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// Generic CPU reduction kernel: Max/Min/Sum/Mean over arithmetic T, and
// All/Any over bool. The concrete operation is selected from the node name in
// InitKernel and stored as reduce_type_ / reduce_func_.
template <typename T>
class ReduceCPUKernel : public CPUKernel {
 public:
  ReduceCPUKernel() = default;
  ~ReduceCPUKernel() override = default;
  // Reads the input shape and AXIS attribute; picks the reduction functor.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Performs the reduction over the axes cached in axis_.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean };
  std::vector<size_t> input_shape_;  // device shape of input 0
  std::vector<int64_t> axis_;        // normalized, sorted, de-duplicated reduce axes
  ReduceType reduce_type_{kReduceAll};
  // Folds input[pos] into *out; set in InitKernel according to reduce_type_.
  std::function<void(const T *, size_t, T *)> reduce_func_;
};
// Register ReduceMean/Max/Sum/Min for float, double, int32_t, int64_t,
// and ReduceAll/ReduceAny for bool.
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool);
MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include <string>
#include <functional>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// Generic CPU reduction kernel: Max/Min/Sum/Mean over arithmetic T, and
// All/Any over bool. The concrete operation is selected from the node name in
// InitKernel and stored as reduce_type_ / reduce_func_.
template <typename T>
class ReduceCPUKernel : public CPUKernel {
 public:
  ReduceCPUKernel() = default;
  ~ReduceCPUKernel() override = default;
  // Reads the input shape and AXIS attribute; picks the reduction functor.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Performs the reduction over the axes cached in axis_.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean };
  std::vector<size_t> input_shape_;  // device shape of input 0
  std::vector<int64_t> axis_;        // normalized, sorted, de-duplicated reduce axes
  ReduceType reduce_type_{kReduceAll};
  // Folds input[pos] into *out; set in InitKernel according to reduce_type_.
  std::function<void(const T *, size_t, T *)> reduce_func_;
};
// Register one ReduceCPUKernel instantiation per reduce primitive and dtype.
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t);
// Boolean reductions only make sense for bool inputs.
MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool);
MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_

View File

@ -1,91 +1,91 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h"
#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Validates the node and caches the input/output device shapes plus the
// "block_size" attribute for use in Launch.
template <typename T>
void SpaceToDepthCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);  // throws unless the node has exactly 1 input / 1 output
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
}
// Scatters each input element to its SpaceToDepth position: every
// block_size x block_size spatial tile of the NCHW input becomes a run of
// channels in the output (dims 2 and 3 are divided by block_size, dim 1 grows).
template <typename T>
bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  // Total input element count.
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Local copies so the lambda below works on plain values rather than members.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // input_strides[j] = product of input dims after j (row-major strides).
  // Only 3 slots, so this assumes a 4-D (NCHW) input — TODO confirm rank is
  // validated upstream.
  size_t input_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      input_strides[j] *= input_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> input_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose flat input index i into per-dimension coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        input_pos_array[j] = tmp_pos / input_strides[j];
        tmp_pos %= input_strides[j];
      }
      input_pos_array.back() = tmp_pos;
      // Rebuild the flat output index: the channel coordinate absorbs the
      // intra-block (h % bs, w % bs) offset; H/W shrink by block_size.
      size_t output_pos = input_pos_array[0];
      output_pos =
        (output_pos * output_shape[1]) +
        (input_pos_array[1] +
         (block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]);
      output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size);
      output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size);
      output_addr[output_pos] = input_addr[i];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
// Validates the node shape contract: SpaceToDepth takes exactly one input
// tensor and produces exactly one output tensor; anything else is a graph bug.
// Fix: the exception messages previously referred to "DepthToSpaceCPUKerrnel"
// (wrong kernel name, also misspelled), which made failures misleading.
template <typename T>
void SpaceToDepthCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 1) {
    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but SpaceToDepthCPUKernel needs 1 input.";
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but SpaceToDepthCPUKernel needs 1 output.";
  }
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h"
#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Validates the node and caches the input/output device shapes plus the
// "block_size" attribute for use in Launch.
template <typename T>
void SpaceToDepthCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);  // throws unless the node has exactly 1 input / 1 output
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
}
// Scatters each input element to its SpaceToDepth position: every
// block_size x block_size spatial tile of the NCHW input becomes a run of
// channels in the output (dims 2 and 3 are divided by block_size, dim 1 grows).
template <typename T>
bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  // Total input element count.
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Local copies so the lambda below works on plain values rather than members.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // input_strides[j] = product of input dims after j (row-major strides).
  // Only 3 slots, so this assumes a 4-D (NCHW) input — TODO confirm rank is
  // validated upstream.
  size_t input_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      input_strides[j] *= input_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> input_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose flat input index i into per-dimension coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        input_pos_array[j] = tmp_pos / input_strides[j];
        tmp_pos %= input_strides[j];
      }
      input_pos_array.back() = tmp_pos;
      // Rebuild the flat output index: the channel coordinate absorbs the
      // intra-block (h % bs, w % bs) offset; H/W shrink by block_size.
      size_t output_pos = input_pos_array[0];
      output_pos =
        (output_pos * output_shape[1]) +
        (input_pos_array[1] +
         (block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]);
      output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size);
      output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size);
      output_addr[output_pos] = input_addr[i];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
// Validates the node shape contract: SpaceToDepth takes exactly one input
// tensor and produces exactly one output tensor; anything else is a graph bug.
// Fix: the exception messages previously referred to "DepthToSpaceCPUKerrnel"
// (wrong kernel name, also misspelled), which made failures misleading.
template <typename T>
void SpaceToDepthCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 1) {
    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but SpaceToDepthCPUKernel needs 1 input.";
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but SpaceToDepthCPUKernel needs 1 output.";
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,84 +1,84 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
#include <string>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU SpaceToDepth: moves each block_size x block_size spatial tile of the
// NCHW input into the channel dimension (H and W shrink by block_size, C grows
// by block_size^2).
template <typename T>
class SpaceToDepthCPUKernel : public CPUKernel {
 public:
  SpaceToDepthCPUKernel() = default;
  ~SpaceToDepthCPUKernel() override = default;
  // Caches shapes and the "block_size" attribute from the kernel node.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Throws unless the node has exactly 1 input and 1 output.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> input_shape_;
  std::vector<size_t> output_shape_;
  // Fix: in-class initializer added — the member was previously uninitialized
  // until InitKernel ran, so a Launch on a default-constructed kernel read
  // indeterminate memory.
  size_t block_size_{0};
};
// Register SpaceToDepthCPUKernel for every supported dtype; SetAllSameAttr
// keeps input and output dtypes identical.
MS_REG_CPU_KERNEL_T(
  SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  SpaceToDepthCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  SpaceToDepthCPUKernel, float16);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
                    SpaceToDepthCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                    SpaceToDepthCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                    SpaceToDepthCPUKernel, int);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
                    SpaceToDepthCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                    SpaceToDepthCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
                    SpaceToDepthCPUKernel, uint16_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
                    SpaceToDepthCPUKernel, uint32_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
                    SpaceToDepthCPUKernel, uint64_t);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
#include <string>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU SpaceToDepth: moves each block_size x block_size spatial tile of the
// NCHW input into the channel dimension (H and W shrink by block_size, C grows
// by block_size^2).
template <typename T>
class SpaceToDepthCPUKernel : public CPUKernel {
 public:
  SpaceToDepthCPUKernel() = default;
  ~SpaceToDepthCPUKernel() override = default;
  // Caches shapes and the "block_size" attribute from the kernel node.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Throws unless the node has exactly 1 input and 1 output.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> input_shape_;
  std::vector<size_t> output_shape_;
  // Fix: in-class initializer added — the member was previously uninitialized
  // until InitKernel ran, so a Launch on a default-constructed kernel read
  // indeterminate memory.
  size_t block_size_{0};
};
// Register SpaceToDepthCPUKernel for every supported dtype; SetAllSameAttr
// keeps input and output dtypes identical.
MS_REG_CPU_KERNEL_T(
  SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  SpaceToDepthCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  SpaceToDepthCPUKernel, float16);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
                    SpaceToDepthCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                    SpaceToDepthCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                    SpaceToDepthCPUKernel, int);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
                    SpaceToDepthCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                    SpaceToDepthCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
                    SpaceToDepthCPUKernel, uint16_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
                    SpaceToDepthCPUKernel, uint32_t);
MS_REG_CPU_KERNEL_T(SpaceToDepth,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
                    SpaceToDepthCPUKernel, uint64_t);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_

View File

@ -1,87 +1,87 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include "backend/kernel_compiler/cpu/topk_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Computes the k largest values (and their indices) of the innermost dimension
// for each of the outer_size_ rows.
template <typename T>
void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  if (inputs.size() != 2 || outputs.size() != 2) {
    MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but get inputs: " << inputs.size()
                      << "outputs: " << outputs.size();
  }
  if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) {
    MS_LOG(EXCEPTION) << "Error input data size!";
  }
  if (inputs[1]->size != sizeof(int)) {
    MS_LOG(EXCEPTION) << "Input K must be int!";
  }
  auto input = reinterpret_cast<T *>(inputs[0]->addr);
  int k = reinterpret_cast<int *>(inputs[1]->addr)[0];  // k arrives as a scalar tensor
  auto output = reinterpret_cast<T *>(outputs[0]->addr);
  auto indices = reinterpret_cast<int *>(outputs[1]->addr);
  if (k < 1) {
    MS_LOG(EXCEPTION) << "Input k must > 0!";
  }
  // Clamp k to the row length.
  size_t k_num = IntToSize(std::min<int>(inner_size_, k));
  if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) {
    MS_LOG(EXCEPTION) << "Error output data size!";
  }
  for (size_t i = 0; i < outer_size_; ++i) {
    // Sort the row's flat indices by value, descending. stable_sort keeps the
    // original order among equal values, so ties resolve to the earlier index.
    std::vector<size_t> idx(inner_size_);
    auto base_input = i * inner_size_;
    // NOTE(review): std::iota needs <numeric>, which is not included directly
    // here — presumably pulled in transitively; confirm.
    std::iota(idx.begin(), idx.end(), base_input);
    std::stable_sort(idx.begin(), idx.end(),
                     [&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; });
    auto base_output = i * k_num;
    if (!sorted_) {
      // sorted == false: emit the k winners in their original index order
      // instead of descending value order.
      std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num));
    }
    for (size_t j = 0; j < k_num; ++j) {
      // Indices are reported relative to the start of the row.
      indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input);
      output[base_output + j] = input[idx[j]];
    }
  }
}
// Caches shape/attrs: outer_size_ = product of all dims except the last,
// inner_size_ = the last dim (the dimension TopK runs over).
// NOTE(review): assumes the input has rank >= 1 — an empty shape would make
// x_shape_.size() - 1 wrap around; confirm rank is validated upstream.
void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < x_shape_.size() - 1; ++i) {
    outer_size_ *= x_shape_[i];
  }
  inner_size_ = x_shape_[x_shape_.size() - 1];
  sorted_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "sorted");
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
// Dispatches to the dtype-specialized LaunchKernel instantiation; dtypes other
// than fp16/fp32 are silently ignored, matching the registered KernelAttr.
bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  switch (dtype_) {
    case kNumberTypeFloat16:
      LaunchKernel<float16>(inputs, outputs);
      break;
    case kNumberTypeFloat32:
      LaunchKernel<float>(inputs, outputs);
      break;
    default:
      break;
  }
  return true;
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include "backend/kernel_compiler/cpu/topk_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Computes the k largest values (and their indices) of the innermost dimension
// for each of the outer_size_ rows.
template <typename T>
void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  if (inputs.size() != 2 || outputs.size() != 2) {
    MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but get inputs: " << inputs.size()
                      << "outputs: " << outputs.size();
  }
  if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) {
    MS_LOG(EXCEPTION) << "Error input data size!";
  }
  if (inputs[1]->size != sizeof(int)) {
    MS_LOG(EXCEPTION) << "Input K must be int!";
  }
  auto input = reinterpret_cast<T *>(inputs[0]->addr);
  int k = reinterpret_cast<int *>(inputs[1]->addr)[0];  // k arrives as a scalar tensor
  auto output = reinterpret_cast<T *>(outputs[0]->addr);
  auto indices = reinterpret_cast<int *>(outputs[1]->addr);
  if (k < 1) {
    MS_LOG(EXCEPTION) << "Input k must > 0!";
  }
  // Clamp k to the row length.
  size_t k_num = IntToSize(std::min<int>(inner_size_, k));
  if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) {
    MS_LOG(EXCEPTION) << "Error output data size!";
  }
  for (size_t i = 0; i < outer_size_; ++i) {
    // Sort the row's flat indices by value, descending. stable_sort keeps the
    // original order among equal values, so ties resolve to the earlier index.
    std::vector<size_t> idx(inner_size_);
    auto base_input = i * inner_size_;
    // NOTE(review): std::iota needs <numeric>, which is not included directly
    // here — presumably pulled in transitively; confirm.
    std::iota(idx.begin(), idx.end(), base_input);
    std::stable_sort(idx.begin(), idx.end(),
                     [&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; });
    auto base_output = i * k_num;
    if (!sorted_) {
      // sorted == false: emit the k winners in their original index order
      // instead of descending value order.
      std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num));
    }
    for (size_t j = 0; j < k_num; ++j) {
      // Indices are reported relative to the start of the row.
      indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input);
      output[base_output + j] = input[idx[j]];
    }
  }
}
// Caches shape/attrs: outer_size_ = product of all dims except the last,
// inner_size_ = the last dim (the dimension TopK runs over).
// NOTE(review): assumes the input has rank >= 1 — an empty shape would make
// x_shape_.size() - 1 wrap around; confirm rank is validated upstream.
void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < x_shape_.size() - 1; ++i) {
    outer_size_ *= x_shape_[i];
  }
  inner_size_ = x_shape_[x_shape_.size() - 1];
  sorted_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "sorted");
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
// Dispatches to the dtype-specialized LaunchKernel instantiation; dtypes other
// than fp16/fp32 are silently ignored, matching the registered KernelAttr.
bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  switch (dtype_) {
    case kNumberTypeFloat16:
      LaunchKernel<float16>(inputs, outputs);
      break;
    case kNumberTypeFloat32:
      LaunchKernel<float>(inputs, outputs);
      break;
    default:
      break;
  }
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,46 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU TopK: emits the k largest values of the innermost dimension and their
// row-relative indices. Input 1 carries the scalar k; the "sorted" attribute
// selects descending-value order (true) or original-index order (false).
class TopKCPUKernel : public CPUKernel {
 public:
  TopKCPUKernel() = default;
  ~TopKCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  size_t outer_size_{1};  // product of all dims except the last
  size_t inner_size_{1};  // length of the last dim (the one TopK runs over)
  bool sorted_{false};    // "sorted" node attribute
  TypeId dtype_{kTypeUnknown};
};
MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel)
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU TopK: emits the k largest values of the innermost dimension and their
// row-relative indices. Input 1 carries the scalar k; the "sorted" attribute
// selects descending-value order (true) or original-index order (false).
class TopKCPUKernel : public CPUKernel {
 public:
  TopKCPUKernel() = default;
  ~TopKCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  size_t outer_size_{1};  // product of all dims except the last
  size_t inner_size_{1};  // length of the last dim (the one TopK runs over)
  bool sorted_{false};    // "sorted" node attribute
  TypeId dtype_{kTypeUnknown};
};
MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel)
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_

View File

@ -1,159 +1,159 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h"
#include <algorithm>
#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
#include "common/thread_pool.h"
#include "nnacl/fp32/transpose_fp32.h"
#include "nnacl/int8/transpose_int8.h"
#include "nnacl/errorcode.h"
namespace mindspore {
namespace kernel {
// Caches shapes and the "perm" attribute into transpose_param_, precomputes
// row-major strides, and binds launch_func_ to the dtype-specialized
// LaunchKernel instantiation.
void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
  axes_ = {tmp.begin(), tmp.end()};
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) {
    MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got "
                      << axes_.size() << "D.";
  }
  for (size_t i = 0; i < axes_.size(); ++i) {
    transpose_param_.perm_[i] = SizeToInt(axes_[i]);
  }
  // Row-major strides for input and output, innermost stride = 1.
  // NOTE(review): assumes input_shape_ is non-empty (num_axes >= 1); a scalar
  // input would index strides_[-1] — confirm rank is validated upstream.
  int num_axes = SizeToInt(input_shape_.size());
  transpose_param_.perm_size_ = axes_.size();
  transpose_param_.num_axes_ = num_axes;
  transpose_param_.strides_[num_axes - 1] = 1;
  transpose_param_.out_strides_[num_axes - 1] = 1;
  for (int i = num_axes - 2; i >= 0; i--) {
    transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1];
    transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1];
  }
  // dtype -> templated implementation dispatch table.
  launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>;
  launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>;
  launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel<int>;
  launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel<int64_t>;
  launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel<uint8_t>;
  launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel<uint16_t>;
  launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel<uint32_t>;
  launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel<uint64_t>;
  launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel<float>;
  launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel<bool>;
  auto iter = launch_map_.find(dtype_);
  if (iter != launch_map_.end()) {
    launch_func_ = iter->second;
  } else {
    MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU.";
  }
}
// Forwards to the dtype-specialized implementation that InitKernel bound to
// launch_func_, then reports success to the framework.
bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                   const std::vector<kernel::AddressPtr> &,
                                   const std::vector<kernel::AddressPtr> &outputs) {
  const auto &kernel_func = launch_func_;
  kernel_func(this, inputs, outputs);
  return true;
}
template <typename T>
void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
transpose_param_.data_num_ = inputs[0]->size / sizeof(T);
int output_shape[SizeToInt(output_shape_.size())];
for (size_t i = 0; i < output_shape_.size(); ++i) {
output_shape[i] = SizeToInt(output_shape_[i]);
}
size_t data_count = (inputs[0]->size) / sizeof(T);
if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) {
int res = NNACL_ERR;
if constexpr (std::is_same_v<T, int8_t>) {
res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, int16_t>) {
res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, int32_t>) {
res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, int64_t>) {
res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint8_t>) {
res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint16_t>) {
res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint32_t>) {
res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint64_t>) {
res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, float>) {
res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, bool>) {
res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_);
}
if (res != NNACL_OK) {
MS_LOG(ERROR) << "Transpose run failed";
}
} else {
ParallelRun(input_addr, output_addr, output_shape, data_count);
}
}
// Multi-threaded transpose path: partitions the element range across the
// common thread pool and runs the nnacl TransposeDims* routine for this dtype
// on each shard (each task receives its task_id and the total thread count).
template <typename T>
void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  // Target roughly 128 elements per task, capped at the pool's thread count.
  // NOTE(review): std::ceil needs <cmath>, not included directly here —
  // presumably pulled in transitively; confirm.
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  // Select the nnacl shard routine matching T at compile time.
  std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
  if constexpr (std::is_same_v<T, int8_t>) {
    TransposeDims = &TransposeDimsInt8;
  } else if constexpr (std::is_same_v<T, int16_t>) {
    TransposeDims = &TransposeDimsInt16;
  } else if constexpr (std::is_same_v<T, int32_t>) {
    TransposeDims = &TransposeDimsInt32;
  } else if constexpr (std::is_same_v<T, int64_t>) {
    TransposeDims = &TransposeDimsInt64;
  } else if constexpr (std::is_same_v<T, uint8_t>) {
    TransposeDims = &TransposeDimsUInt8;
  } else if constexpr (std::is_same_v<T, uint16_t>) {
    TransposeDims = &TransposeDimsUInt16;
  } else if constexpr (std::is_same_v<T, uint32_t>) {
    TransposeDims = &TransposeDimsUInt32;
  } else if constexpr (std::is_same_v<T, uint64_t>) {
    TransposeDims = &TransposeDimsUInt64;
  } else if constexpr (std::is_same_v<T, float>) {
    TransposeDims = &TransposeDimsFp32;
  } else if constexpr (std::is_same_v<T, bool>) {
    TransposeDims = &TransposeDimsBool;
  }
  for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
    // task_id is captured by value: each task must keep its own shard index.
    auto task = [&, task_id, thread_num]() {
      TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
      return common::SUCCESS;
    };
    tasks.emplace_back(task);
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h"
#include <algorithm>
#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
#include "common/thread_pool.h"
#include "nnacl/fp32/transpose_fp32.h"
#include "nnacl/int8/transpose_int8.h"
#include "nnacl/errorcode.h"
namespace mindspore {
namespace kernel {
void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
axes_ = {tmp.begin(), tmp.end()};
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) {
MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got "
<< axes_.size() << "D.";
}
for (size_t i = 0; i < axes_.size(); ++i) {
transpose_param_.perm_[i] = SizeToInt(axes_[i]);
}
int num_axes = SizeToInt(input_shape_.size());
transpose_param_.perm_size_ = axes_.size();
transpose_param_.num_axes_ = num_axes;
transpose_param_.strides_[num_axes - 1] = 1;
transpose_param_.out_strides_[num_axes - 1] = 1;
for (int i = num_axes - 2; i >= 0; i--) {
transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1];
transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1];
}
launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>;
launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>;
launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel<int>;
launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel<int64_t>;
launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel<uint8_t>;
launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel<uint16_t>;
launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel<uint32_t>;
launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel<uint64_t>;
launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel<float>;
launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel<bool>;
auto iter = launch_map_.find(dtype_);
if (iter != launch_map_.end()) {
launch_func_ = iter->second;
} else {
MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU.";
}
}
// Entry point invoked by the framework: forwards to the dtype-specialized
// implementation that InitKernel stored in launch_func_. The workspace
// argument is unused by this kernel. Always reports success; per-dtype
// failures are logged inside LaunchKernel.
bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                   const std::vector<kernel::AddressPtr> & /* workspace */,
                                   const std::vector<kernel::AddressPtr> &outputs) {
  const auto &kernel_fn = launch_func_;
  kernel_fn(this, inputs, outputs);
  return true;
}
template <typename T>
void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
transpose_param_.data_num_ = inputs[0]->size / sizeof(T);
int output_shape[SizeToInt(output_shape_.size())];
for (size_t i = 0; i < output_shape_.size(); ++i) {
output_shape[i] = SizeToInt(output_shape_[i]);
}
size_t data_count = (inputs[0]->size) / sizeof(T);
if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) {
int res = NNACL_ERR;
if constexpr (std::is_same_v<T, int8_t>) {
res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, int16_t>) {
res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, int32_t>) {
res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, int64_t>) {
res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint8_t>) {
res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint16_t>) {
res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint32_t>) {
res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint64_t>) {
res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, float>) {
res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, bool>) {
res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_);
}
if (res != NNACL_OK) {
MS_LOG(ERROR) << "Transpose run failed";
}
} else {
ParallelRun(input_addr, output_addr, output_shape, data_count);
}
}
template <typename T>
void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
const float block_size = 128.0;
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
std::vector<common::Task> tasks;
std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
if constexpr (std::is_same_v<T, int8_t>) {
TransposeDims = &TransposeDimsInt8;
} else if constexpr (std::is_same_v<T, int16_t>) {
TransposeDims = &TransposeDimsInt16;
} else if constexpr (std::is_same_v<T, int32_t>) {
TransposeDims = &TransposeDimsInt32;
} else if constexpr (std::is_same_v<T, int64_t>) {
TransposeDims = &TransposeDimsInt64;
} else if constexpr (std::is_same_v<T, uint8_t>) {
TransposeDims = &TransposeDimsUInt8;
} else if constexpr (std::is_same_v<T, uint16_t>) {
TransposeDims = &TransposeDimsUInt16;
} else if constexpr (std::is_same_v<T, uint32_t>) {
TransposeDims = &TransposeDimsUInt32;
} else if constexpr (std::is_same_v<T, uint64_t>) {
TransposeDims = &TransposeDimsUInt64;
} else if constexpr (std::is_same_v<T, float>) {
TransposeDims = &TransposeDimsFp32;
} else if constexpr (std::is_same_v<T, bool>) {
TransposeDims = &TransposeDimsBool;
}
for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
auto task = [&, task_id, thread_num]() {
TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
return common::SUCCESS;
};
tasks.emplace_back(task);
}
common::ThreadPool::GetInstance().SyncRun(tasks);
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,58 +1,58 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
#include <vector>
#include <unordered_map>
#include <memory>
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "nnacl/base/transpose_base.h"
namespace mindspore {
namespace kernel {
// CPU forward kernel for the Transpose operator: permutes the axes of the
// input tensor according to the node's "perm" attribute.
class TransposeCPUFwdKernel : public CPUKernel {
 public:
  TransposeCPUFwdKernel() = default;
  ~TransposeCPUFwdKernel() override = default;
  // Reads shapes, dtype and "perm" from the node, precomputes strides in
  // transpose_param_, and selects the dtype-specific launch function.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches to the dtype-specialized LaunchKernel chosen in InitKernel.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Serial or parallel transpose for element type T.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  // Thread-pool split of the transpose over `count` elements.
  template <typename T>
  void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);
  TransposeParameter transpose_param_;   // nnacl parameter block (perm, strides, data_num_)
  std::vector<size_t> input_shape_;      // input device shape
  std::vector<size_t> output_shape_;     // output device shape
  std::vector<size_t> axes_;             // permutation from the "perm" attribute
  TypeId dtype_{kTypeUnknown};           // input device dtype
  using TypeKernel =
    std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
  std::unordered_map<TypeId, TypeKernel> launch_map_;  // dtype -> LaunchKernel<T>
  TypeKernel launch_func_;               // entry selected from launch_map_
};
MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
#include <vector>
#include <unordered_map>
#include <memory>
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "nnacl/base/transpose_base.h"
namespace mindspore {
namespace kernel {
// CPU forward kernel for the Transpose operator: permutes the axes of the
// input tensor according to the node's "perm" attribute.
class TransposeCPUFwdKernel : public CPUKernel {
 public:
  TransposeCPUFwdKernel() = default;
  ~TransposeCPUFwdKernel() override = default;
  // Reads shapes, dtype and "perm" from the node, precomputes strides in
  // transpose_param_, and selects the dtype-specific launch function.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches to the dtype-specialized LaunchKernel chosen in InitKernel.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Serial or parallel transpose for element type T.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  // Thread-pool split of the transpose over `count` elements.
  template <typename T>
  void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);
  TransposeParameter transpose_param_;   // nnacl parameter block (perm, strides, data_num_)
  std::vector<size_t> input_shape_;      // input device shape
  std::vector<size_t> output_shape_;     // output device shape
  std::vector<size_t> axes_;             // permutation from the "perm" attribute
  TypeId dtype_{kTypeUnknown};           // input device dtype
  using TypeKernel =
    std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
  std::unordered_map<TypeId, TypeKernel> launch_map_;  // dtype -> LaunchKernel<T>
  TypeKernel launch_func_;               // entry selected from launch_map_
};
MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_