change from CRLF to LF
This commit is contained in:
parent
85e20508eb
commit
b3d4399d32
|
@ -1,116 +1,116 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Input tensor count when the optional weight tensor is supplied (x, y, weight).
constexpr size_t kBceInputNumWithWeight = 3;
|
||||
|
||||
template <typename T>
|
||||
void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) {
|
||||
if (input_size % 2 == 1) {
|
||||
tmp_loss[0] += tmp_loss[input_size - 1];
|
||||
}
|
||||
|
||||
for (int stride = input_size / 2; stride > 0; stride = stride / 2) {
|
||||
for (int i = 0; i < stride; i++) {
|
||||
tmp_loss[i] += tmp_loss[i + stride];
|
||||
}
|
||||
if (stride > 2 && stride % 2 == 1) {
|
||||
tmp_loss[0] += tmp_loss[stride - 1];
|
||||
}
|
||||
}
|
||||
|
||||
loss[0] += tmp_loss[0];
|
||||
if (reduction == 1) {
|
||||
loss[0] /= static_cast<T>(input_size);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
T *weight = nullptr;
|
||||
if (weight_defined_) {
|
||||
weight = reinterpret_cast<T *>(inputs[2]->addr);
|
||||
}
|
||||
T *loss = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
std::vector<T> tmp_loss(input_size_);
|
||||
|
||||
T epsilon = static_cast<T>(1e-12);
|
||||
T one = static_cast<T>(1);
|
||||
if (reduction_ == 0 && weight_defined_) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T value =
|
||||
-weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
|
||||
loss[i] = value;
|
||||
}
|
||||
} else if (reduction_ == 0 && (!weight_defined_)) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
|
||||
loss[i] = value;
|
||||
}
|
||||
} else if ((reduction_ != 0) && weight_defined_) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T value =
|
||||
-weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
|
||||
tmp_loss[i] = value;
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
|
||||
tmp_loss[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
if (reduction_ != 0) {
|
||||
LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data());
|
||||
}
|
||||
}
|
||||
|
||||
bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
if (input_size_ > 0) {
|
||||
if (dtype_ == kNumberTypeFloat32) {
|
||||
Launchkernel<float>(inputs, workspace, outputs);
|
||||
} else if (dtype_ == kNumberTypeFloat16) {
|
||||
Launchkernel<float16>(inputs, workspace, outputs);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
for (size_t i = 0; i < input_shape.size(); i++) {
|
||||
input_size_ *= input_shape[i];
|
||||
}
|
||||
string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
|
||||
if (reduction == "none") {
|
||||
reduction_ = 0;
|
||||
} else if (reduction == "sum") {
|
||||
reduction_ = 2;
|
||||
}
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
weight_defined_ = (input_num == kBceInputNumWithWeight);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Input tensor count when the optional weight tensor is supplied (x, y, weight).
constexpr size_t kBceInputNumWithWeight = 3;
|
||||
|
||||
template <typename T>
|
||||
void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) {
|
||||
if (input_size % 2 == 1) {
|
||||
tmp_loss[0] += tmp_loss[input_size - 1];
|
||||
}
|
||||
|
||||
for (int stride = input_size / 2; stride > 0; stride = stride / 2) {
|
||||
for (int i = 0; i < stride; i++) {
|
||||
tmp_loss[i] += tmp_loss[i + stride];
|
||||
}
|
||||
if (stride > 2 && stride % 2 == 1) {
|
||||
tmp_loss[0] += tmp_loss[stride - 1];
|
||||
}
|
||||
}
|
||||
|
||||
loss[0] += tmp_loss[0];
|
||||
if (reduction == 1) {
|
||||
loss[0] /= static_cast<T>(input_size);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
T *weight = nullptr;
|
||||
if (weight_defined_) {
|
||||
weight = reinterpret_cast<T *>(inputs[2]->addr);
|
||||
}
|
||||
T *loss = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
std::vector<T> tmp_loss(input_size_);
|
||||
|
||||
T epsilon = static_cast<T>(1e-12);
|
||||
T one = static_cast<T>(1);
|
||||
if (reduction_ == 0 && weight_defined_) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T value =
|
||||
-weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
|
||||
loss[i] = value;
|
||||
}
|
||||
} else if (reduction_ == 0 && (!weight_defined_)) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
|
||||
loss[i] = value;
|
||||
}
|
||||
} else if ((reduction_ != 0) && weight_defined_) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T value =
|
||||
-weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
|
||||
tmp_loss[i] = value;
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
|
||||
tmp_loss[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
if (reduction_ != 0) {
|
||||
LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data());
|
||||
}
|
||||
}
|
||||
|
||||
bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
if (input_size_ > 0) {
|
||||
if (dtype_ == kNumberTypeFloat32) {
|
||||
Launchkernel<float>(inputs, workspace, outputs);
|
||||
} else if (dtype_ == kNumberTypeFloat16) {
|
||||
Launchkernel<float16>(inputs, workspace, outputs);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
for (size_t i = 0; i < input_shape.size(); i++) {
|
||||
input_size_ *= input_shape[i];
|
||||
}
|
||||
string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
|
||||
if (reduction == "none") {
|
||||
reduction_ = 0;
|
||||
} else if (reduction == "sum") {
|
||||
reduction_ = 2;
|
||||
}
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
weight_defined_ = (input_num == kBceInputNumWithWeight);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,71 +1,71 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class BinaryCrossEntropyCpuKernel : public CPUKernel {
|
||||
public:
|
||||
BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
|
||||
~BinaryCrossEntropyCpuKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
|
||||
template <typename T>
|
||||
void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs);
|
||||
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
size_t input_size_;
|
||||
int reduction_;
|
||||
bool weight_defined_; // true: there are 3 inputs, false: there are 2 inputs(no [weight])
|
||||
};
|
||||
// Registrations of BinaryCrossEntropy for float16 and float32: first the three-input signatures
// (with weight), then the two-input signatures (without weight).
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class BinaryCrossEntropyCpuKernel : public CPUKernel {
|
||||
public:
|
||||
BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
|
||||
~BinaryCrossEntropyCpuKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
|
||||
template <typename T>
|
||||
void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs);
|
||||
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
size_t input_size_;
|
||||
int reduction_;
|
||||
bool weight_defined_; // true: there are 3 inputs, false: there are 2 inputs(no [weight])
|
||||
};
|
||||
// Registrations of BinaryCrossEntropy for float16 and float32: first the three-input signatures
// (with weight), then the two-input signatures (without weight).
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
|
||||
|
|
|
@ -1,102 +1,102 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Input tensor count when the optional weight tensor is supplied (x, y, dloss, weight).
constexpr size_t kBceGradInputNumWithWeight = 4;
|
||||
|
||||
template <typename T>
|
||||
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
T *dloss = reinterpret_cast<T *>(inputs[2]->addr);
|
||||
T *weight = nullptr;
|
||||
if (weight_defined_) {
|
||||
weight = reinterpret_cast<T *>(inputs[3]->addr);
|
||||
}
|
||||
|
||||
T *dx = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
|
||||
T epsilon = static_cast<T>(1e-12);
|
||||
T one = static_cast<T>(1);
|
||||
if (reduction_ == 0) {
|
||||
if (weight_defined_) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
|
||||
T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
|
||||
dx[i] = value * dloss[i];
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
|
||||
T value = (input_x[i] - input_y[i]) / denominator;
|
||||
dx[i] = value * dloss[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
T dloss1 = dloss[0];
|
||||
if (reduction_ == 1) {
|
||||
dloss1 = dloss[0] / static_cast<T>(input_size_);
|
||||
}
|
||||
if (weight_defined_) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
|
||||
T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
|
||||
dx[i] = value * dloss1;
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
|
||||
T value = (input_x[i] - input_y[i]) / denominator;
|
||||
dx[i] = value * dloss1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
if (input_size_ > 0) {
|
||||
if (dtype_ == kNumberTypeFloat32) {
|
||||
Launchkernel<float>(inputs, outputs);
|
||||
} else if (dtype_ == kNumberTypeFloat16) {
|
||||
Launchkernel<float16>(inputs, outputs);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
for (size_t i = 0; i < input_shape.size(); i++) {
|
||||
input_size_ *= input_shape[i];
|
||||
}
|
||||
string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
|
||||
if (reduction == "none") {
|
||||
reduction_ = 0;
|
||||
} else if (reduction == "sum") {
|
||||
reduction_ = 2;
|
||||
}
|
||||
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
weight_defined_ = (input_num == kBceGradInputNumWithWeight);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Input tensor count when the optional weight tensor is supplied (x, y, dloss, weight).
constexpr size_t kBceGradInputNumWithWeight = 4;
|
||||
|
||||
template <typename T>
|
||||
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
T *dloss = reinterpret_cast<T *>(inputs[2]->addr);
|
||||
T *weight = nullptr;
|
||||
if (weight_defined_) {
|
||||
weight = reinterpret_cast<T *>(inputs[3]->addr);
|
||||
}
|
||||
|
||||
T *dx = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
|
||||
T epsilon = static_cast<T>(1e-12);
|
||||
T one = static_cast<T>(1);
|
||||
if (reduction_ == 0) {
|
||||
if (weight_defined_) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
|
||||
T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
|
||||
dx[i] = value * dloss[i];
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
|
||||
T value = (input_x[i] - input_y[i]) / denominator;
|
||||
dx[i] = value * dloss[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
T dloss1 = dloss[0];
|
||||
if (reduction_ == 1) {
|
||||
dloss1 = dloss[0] / static_cast<T>(input_size_);
|
||||
}
|
||||
if (weight_defined_) {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
|
||||
T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
|
||||
dx[i] = value * dloss1;
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < input_size_; i++) {
|
||||
T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
|
||||
T value = (input_x[i] - input_y[i]) / denominator;
|
||||
dx[i] = value * dloss1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
if (input_size_ > 0) {
|
||||
if (dtype_ == kNumberTypeFloat32) {
|
||||
Launchkernel<float>(inputs, outputs);
|
||||
} else if (dtype_ == kNumberTypeFloat16) {
|
||||
Launchkernel<float16>(inputs, outputs);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
for (size_t i = 0; i < input_shape.size(); i++) {
|
||||
input_size_ *= input_shape[i];
|
||||
}
|
||||
string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
|
||||
if (reduction == "none") {
|
||||
reduction_ = 0;
|
||||
} else if (reduction == "sum") {
|
||||
reduction_ = 2;
|
||||
}
|
||||
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
weight_defined_ = (input_num == kBceGradInputNumWithWeight);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,76 +1,76 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
|
||||
public:
|
||||
BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
|
||||
~BinaryCrossEntropyGradCpuKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
size_t input_size_;
|
||||
int reduction_;
|
||||
bool weight_defined_; // true: there are 4 inputs, false: there are 3 inputs(no [weight])
|
||||
};
|
||||
// Registrations of BinaryCrossEntropyGrad for float16 and float32: first the four-input
// signatures (with weight), then the three-input signatures (without weight).
MS_REG_CPU_KERNEL(
  BinaryCrossEntropyGrad,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropyGrad,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropyGrad,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropyGrad,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyGradCpuKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
|
||||
public:
|
||||
BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
|
||||
~BinaryCrossEntropyGradCpuKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
size_t input_size_;
|
||||
int reduction_;
|
||||
bool weight_defined_; // true: there are 4 inputs, false: there are 3 inputs(no [weight])
|
||||
};
|
||||
// Registrations of BinaryCrossEntropyGrad for float16 and float32: first the four-input
// signatures (with weight), then the three-input signatures (without weight).
MS_REG_CPU_KERNEL(
  BinaryCrossEntropyGrad,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropyGrad,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropyGrad,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddInputAttr(kNumberTypeFloat16)
    .AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropyGrad,
  KernelAttr()
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddInputAttr(kNumberTypeFloat32)
    .AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyGradCpuKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
|
||||
|
|
|
@ -1,271 +1,271 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include "common/thread_pool.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
for (size_t input_index = 0; input_index < input_num; ++input_index) {
|
||||
TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
|
||||
size_t type_size = GetTypeByte(TypeIdToType(type_id));
|
||||
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
|
||||
size_t tensor_size =
|
||||
shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
|
||||
tensor_size = std::max(tensor_size, type_size);
|
||||
input_size_list_.emplace_back(tensor_size);
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
for (size_t output_index = 0; output_index < output_num; ++output_index) {
|
||||
TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
|
||||
size_t type_size = GetTypeByte(TypeIdToType(type_id));
|
||||
std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
|
||||
size_t tensor_size =
|
||||
shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
|
||||
tensor_size = std::max(tensor_size, type_size);
|
||||
output_size_list_.emplace_back(tensor_size);
|
||||
}
|
||||
}
|
||||
|
||||
void CPUKernel::Init(const CNodePtr &kernel_node) {
|
||||
InitKernel(kernel_node);
|
||||
InitInputOutputSize(kernel_node);
|
||||
}
|
||||
|
||||
// Pads *shape at the front with 1s until it has four dimensions. Shapes that
// already have four or more dimensions are left untouched.
void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
  constexpr size_t kTargetRank = 4;
  const size_t rank = shape->size();
  if (rank >= kTargetRank) {
    return;
  }
  // One bulk insert of the missing leading dimensions.
  shape->insert(shape->begin(), kTargetRank - rank, static_cast<size_t>(1));
}
|
||||
|
||||
// Flat offset of element (dim0, dim1, dim2, dim3) in a contiguous four-dimensional
// array whose extents are shape[0..3]; shape[0] itself is not needed.
size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2,
                                  size_t dim3) {
  // Horner form of dim0*s1*s2*s3 + dim1*s2*s3 + dim2*s3 + dim3.
  return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3;
}
|
||||
|
||||
size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
|
||||
if (axis < 0) {
|
||||
axis = axis + SizeToInt(shape.size());
|
||||
}
|
||||
size_t result = 1;
|
||||
for (int j = 3; j > axis; --j) {
|
||||
result *= shape[j];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
|
||||
size_t accumulation = 1;
|
||||
element_num->emplace_back(1);
|
||||
for (size_t i = shape.size() - 1; i > 0; --i) {
|
||||
accumulation *= shape[i];
|
||||
element_num->emplace_back(accumulation);
|
||||
}
|
||||
std::reverse(element_num->begin(), element_num->end());
|
||||
}
|
||||
|
||||
void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) {
|
||||
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
|
||||
const float block_size = 128.0;
|
||||
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
|
||||
std::vector<common::Task> tasks;
|
||||
size_t start = 0;
|
||||
size_t once_compute_size = (count + thread_num - 1) / thread_num;
|
||||
while (start < count) {
|
||||
size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
|
||||
auto block = [&, start, end]() {
|
||||
task(start, end);
|
||||
return common::SUCCESS;
|
||||
};
|
||||
tasks.emplace_back(block);
|
||||
start += once_compute_size;
|
||||
}
|
||||
common::ThreadPool::GetInstance().SyncRun(tasks);
|
||||
}
|
||||
|
||||
std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
|
||||
if (axis < 0) {
|
||||
axis = axis + SizeToInt(shape.size());
|
||||
}
|
||||
size_t dim_row = 1;
|
||||
size_t dim_col = 1;
|
||||
std::vector<size_t> flat_shape;
|
||||
for (size_t i = 0; i < shape.size(); ++i) {
|
||||
if (SizeToInt(i) < axis) {
|
||||
dim_row *= shape[i];
|
||||
} else {
|
||||
dim_col *= shape[i];
|
||||
}
|
||||
}
|
||||
flat_shape.push_back(dim_row);
|
||||
flat_shape.push_back(dim_col);
|
||||
return flat_shape;
|
||||
}
|
||||
|
||||
// Takes ownership of the two input shapes and the output shape, pads the input
// shapes to the output rank and prepares the stride tables used for iteration.
BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  // The rank is stored as int because the iteration loops count down to -1.
  output_dimension_ = SizeToInt(output_shape_.size());
  BroadcastShape();
  // Each per-dimension table gets one zero-initialised slot per output dimension.
  for (auto *table :
       {&input_strides_a_, &input_strides_b_, &input_back_strides_a_, &input_back_strides_b_, &coordinates_}) {
    table->resize(output_dimension_);
  }
  InitStrides();
}
|
||||
|
||||
void BroadcastIterator::SetPos(size_t pos) {
|
||||
for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
|
||||
coordinates_[i] = pos % output_shape_[i];
|
||||
input_pos_[0] += coordinates_[i] * input_strides_a_[i];
|
||||
input_pos_[1] += coordinates_[i] * input_strides_b_[i];
|
||||
pos /= output_shape_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void BroadcastIterator::GenNextPos() {
|
||||
// Calculate output next coordinate
|
||||
for (int i = output_dimension_ - 1; i >= 0; --i) {
|
||||
if (coordinates_[i] + 1 == output_shape_[i]) {
|
||||
coordinates_[i] = 0;
|
||||
input_pos_[0] -= input_back_strides_a_[i];
|
||||
input_pos_[1] -= input_back_strides_b_[i];
|
||||
} else {
|
||||
++coordinates_[i];
|
||||
input_pos_[0] += input_strides_a_[i];
|
||||
input_pos_[1] += input_strides_b_[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BroadcastIterator::BroadcastShape() {
|
||||
int input_dimension_a = input_shape_a_.size();
|
||||
if (input_dimension_a < output_dimension_) {
|
||||
input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
|
||||
}
|
||||
|
||||
int input_dimension_b = input_shape_b_.size();
|
||||
if (input_dimension_b < output_dimension_) {
|
||||
input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void BroadcastIterator::InitStrides() {
|
||||
input_strides_a_[output_dimension_ - 1] = 1;
|
||||
input_strides_b_[output_dimension_ - 1] = 1;
|
||||
for (int i = output_dimension_ - 2; i >= 0; --i) {
|
||||
input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
|
||||
input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
|
||||
input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
|
||||
input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
|
||||
}
|
||||
|
||||
// Update strides for broadcast
|
||||
// While the axis value is 1, the stride is 0
|
||||
std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
|
||||
[](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
|
||||
std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
|
||||
[](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
|
||||
}
|
||||
|
||||
// Prepares iteration over the transposed output in output order.
//
// output_shape - shape of the transposed tensor
// axes         - axes[i] is the input dimension that output dimension i reads from
// input_shape  - shape of the tensor before the transpose
//
// The contiguous input strides are now kept as size_t; the previous uint32_t
// temporaries silently truncated strides of tensors with more than 2^32 elements.
TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Calculate the contiguous strides of the input
  dimension_ = SizeToInt(shape_.size());
  std::vector<size_t> strides(shape_.size(), 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }

  // Swap shape and strides and calculate back strides
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }

  // All coordinates start at zero; SetPos() moves them to a given position
  coordinates_.resize(dimension_);
}
|
||||
|
||||
void TransposeIterator::SetPos(size_t pos) {
|
||||
for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
|
||||
coordinates_[i] = pos % shape_[i];
|
||||
pos_ += coordinates_[i] * strides_[i];
|
||||
pos /= shape_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void TransposeIterator::GenNextPos() {
|
||||
for (int i = dimension_ - 1; i >= 0; --i) {
|
||||
if (coordinates_[i] + 1 == shape_[i]) {
|
||||
coordinates_[i] = 0;
|
||||
pos_ -= back_strides_[i];
|
||||
} else {
|
||||
coordinates_[i]++;
|
||||
pos_ += strides_[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Computes the broadcast shape of `x` and `y`: shapes are aligned at their last
// dimension, the extra leading dimensions of the longer shape are copied, and for
// each aligned pair the extent that is not 1 (or the common extent) is taken.
//
// Changes compared with the previous version:
//  * indexing uses size_t only instead of a negated size_t stored in an int;
//  * the result is built directly, so a pair of incompatible extents can no
//    longer cause an out-of-bounds read of the temporary vector.
// NOTE(review): as before, an incompatible pair (different extents, neither of
// them 1) contributes no dimension and is not reported - callers are presumably
// expected to validate the shapes first; confirm.
std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) {
  const size_t x_len = x.size();
  const size_t y_len = y.size();
  const size_t length = std::min(x_len, y_len);
  const std::vector<size_t> &longer = (length == x_len) ? y : x;
  const size_t lead = longer.size() - length;

  std::vector<size_t> broadcast_shape;
  broadcast_shape.reserve(longer.size());
  // Leading dimensions that exist only in the longer shape are taken as they are.
  for (size_t i = 0; i < lead; ++i) {
    broadcast_shape.push_back(longer[i]);
  }
  // Trailing dimensions shared by both shapes.
  for (size_t k = 0; k < length; ++k) {
    const size_t x_dim = x[x_len - length + k];
    const size_t y_dim = y[y_len - length + k];
    if (x_dim == 1) {
      broadcast_shape.push_back(y_dim);
    } else if (y_dim == 1 || x_dim == y_dim) {
      broadcast_shape.push_back(x_dim);
    }
  }
  return broadcast_shape;
}
|
||||
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include "common/thread_pool.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
for (size_t input_index = 0; input_index < input_num; ++input_index) {
|
||||
TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
|
||||
size_t type_size = GetTypeByte(TypeIdToType(type_id));
|
||||
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
|
||||
size_t tensor_size =
|
||||
shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
|
||||
tensor_size = std::max(tensor_size, type_size);
|
||||
input_size_list_.emplace_back(tensor_size);
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
for (size_t output_index = 0; output_index < output_num; ++output_index) {
|
||||
TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
|
||||
size_t type_size = GetTypeByte(TypeIdToType(type_id));
|
||||
std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
|
||||
size_t tensor_size =
|
||||
shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
|
||||
tensor_size = std::max(tensor_size, type_size);
|
||||
output_size_list_.emplace_back(tensor_size);
|
||||
}
|
||||
}
|
||||
|
||||
void CPUKernel::Init(const CNodePtr &kernel_node) {
|
||||
InitKernel(kernel_node);
|
||||
InitInputOutputSize(kernel_node);
|
||||
}
|
||||
|
||||
// Pads *shape at the front with 1s until it has four dimensions. Shapes that
// already have four or more dimensions are left untouched.
void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
  constexpr size_t kTargetRank = 4;
  const size_t rank = shape->size();
  if (rank >= kTargetRank) {
    return;
  }
  // One bulk insert of the missing leading dimensions.
  shape->insert(shape->begin(), kTargetRank - rank, static_cast<size_t>(1));
}
|
||||
|
||||
// Flat offset of element (dim0, dim1, dim2, dim3) in a contiguous four-dimensional
// array whose extents are shape[0..3]; shape[0] itself is not needed.
size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2,
                                  size_t dim3) {
  // Horner form of dim0*s1*s2*s3 + dim1*s2*s3 + dim2*s3 + dim3.
  return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3;
}
|
||||
|
||||
size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
|
||||
if (axis < 0) {
|
||||
axis = axis + SizeToInt(shape.size());
|
||||
}
|
||||
size_t result = 1;
|
||||
for (int j = 3; j > axis; --j) {
|
||||
result *= shape[j];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
|
||||
size_t accumulation = 1;
|
||||
element_num->emplace_back(1);
|
||||
for (size_t i = shape.size() - 1; i > 0; --i) {
|
||||
accumulation *= shape[i];
|
||||
element_num->emplace_back(accumulation);
|
||||
}
|
||||
std::reverse(element_num->begin(), element_num->end());
|
||||
}
|
||||
|
||||
void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) {
|
||||
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
|
||||
const float block_size = 128.0;
|
||||
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
|
||||
std::vector<common::Task> tasks;
|
||||
size_t start = 0;
|
||||
size_t once_compute_size = (count + thread_num - 1) / thread_num;
|
||||
while (start < count) {
|
||||
size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
|
||||
auto block = [&, start, end]() {
|
||||
task(start, end);
|
||||
return common::SUCCESS;
|
||||
};
|
||||
tasks.emplace_back(block);
|
||||
start += once_compute_size;
|
||||
}
|
||||
common::ThreadPool::GetInstance().SyncRun(tasks);
|
||||
}
|
||||
|
||||
std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
|
||||
if (axis < 0) {
|
||||
axis = axis + SizeToInt(shape.size());
|
||||
}
|
||||
size_t dim_row = 1;
|
||||
size_t dim_col = 1;
|
||||
std::vector<size_t> flat_shape;
|
||||
for (size_t i = 0; i < shape.size(); ++i) {
|
||||
if (SizeToInt(i) < axis) {
|
||||
dim_row *= shape[i];
|
||||
} else {
|
||||
dim_col *= shape[i];
|
||||
}
|
||||
}
|
||||
flat_shape.push_back(dim_row);
|
||||
flat_shape.push_back(dim_col);
|
||||
return flat_shape;
|
||||
}
|
||||
|
||||
// Takes ownership of the two input shapes and the output shape, pads the input
// shapes to the output rank and prepares the stride tables used for iteration.
BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  // The rank is stored as int because the iteration loops count down to -1.
  output_dimension_ = SizeToInt(output_shape_.size());
  BroadcastShape();
  // Each per-dimension table gets one zero-initialised slot per output dimension.
  for (auto *table :
       {&input_strides_a_, &input_strides_b_, &input_back_strides_a_, &input_back_strides_b_, &coordinates_}) {
    table->resize(output_dimension_);
  }
  InitStrides();
}
|
||||
|
||||
void BroadcastIterator::SetPos(size_t pos) {
|
||||
for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
|
||||
coordinates_[i] = pos % output_shape_[i];
|
||||
input_pos_[0] += coordinates_[i] * input_strides_a_[i];
|
||||
input_pos_[1] += coordinates_[i] * input_strides_b_[i];
|
||||
pos /= output_shape_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void BroadcastIterator::GenNextPos() {
|
||||
// Calculate output next coordinate
|
||||
for (int i = output_dimension_ - 1; i >= 0; --i) {
|
||||
if (coordinates_[i] + 1 == output_shape_[i]) {
|
||||
coordinates_[i] = 0;
|
||||
input_pos_[0] -= input_back_strides_a_[i];
|
||||
input_pos_[1] -= input_back_strides_b_[i];
|
||||
} else {
|
||||
++coordinates_[i];
|
||||
input_pos_[0] += input_strides_a_[i];
|
||||
input_pos_[1] += input_strides_b_[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BroadcastIterator::BroadcastShape() {
|
||||
int input_dimension_a = input_shape_a_.size();
|
||||
if (input_dimension_a < output_dimension_) {
|
||||
input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
|
||||
}
|
||||
|
||||
int input_dimension_b = input_shape_b_.size();
|
||||
if (input_dimension_b < output_dimension_) {
|
||||
input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void BroadcastIterator::InitStrides() {
|
||||
input_strides_a_[output_dimension_ - 1] = 1;
|
||||
input_strides_b_[output_dimension_ - 1] = 1;
|
||||
for (int i = output_dimension_ - 2; i >= 0; --i) {
|
||||
input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
|
||||
input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
|
||||
input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
|
||||
input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
|
||||
}
|
||||
|
||||
// Update strides for broadcast
|
||||
// While the axis value is 1, the stride is 0
|
||||
std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
|
||||
[](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
|
||||
std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
|
||||
[](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
|
||||
}
|
||||
|
||||
// Prepares iteration over the transposed output in output order.
//
// output_shape - shape of the transposed tensor
// axes         - axes[i] is the input dimension that output dimension i reads from
// input_shape  - shape of the tensor before the transpose
//
// The contiguous input strides are now kept as size_t; the previous uint32_t
// temporaries silently truncated strides of tensors with more than 2^32 elements.
TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Calculate the contiguous strides of the input
  dimension_ = SizeToInt(shape_.size());
  std::vector<size_t> strides(shape_.size(), 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }

  // Swap shape and strides and calculate back strides
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }

  // All coordinates start at zero; SetPos() moves them to a given position
  coordinates_.resize(dimension_);
}
|
||||
|
||||
void TransposeIterator::SetPos(size_t pos) {
|
||||
for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
|
||||
coordinates_[i] = pos % shape_[i];
|
||||
pos_ += coordinates_[i] * strides_[i];
|
||||
pos /= shape_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void TransposeIterator::GenNextPos() {
|
||||
for (int i = dimension_ - 1; i >= 0; --i) {
|
||||
if (coordinates_[i] + 1 == shape_[i]) {
|
||||
coordinates_[i] = 0;
|
||||
pos_ -= back_strides_[i];
|
||||
} else {
|
||||
coordinates_[i]++;
|
||||
pos_ += strides_[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Computes the broadcast shape of `x` and `y`: shapes are aligned at their last
// dimension, the extra leading dimensions of the longer shape are copied, and for
// each aligned pair the extent that is not 1 (or the common extent) is taken.
//
// Changes compared with the previous version:
//  * indexing uses size_t only instead of a negated size_t stored in an int;
//  * the result is built directly, so a pair of incompatible extents can no
//    longer cause an out-of-bounds read of the temporary vector.
// NOTE(review): as before, an incompatible pair (different extents, neither of
// them 1) contributes no dimension and is not reported - callers are presumably
// expected to validate the shapes first; confirm.
std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) {
  const size_t x_len = x.size();
  const size_t y_len = y.size();
  const size_t length = std::min(x_len, y_len);
  const std::vector<size_t> &longer = (length == x_len) ? y : x;
  const size_t lead = longer.size() - length;

  std::vector<size_t> broadcast_shape;
  broadcast_shape.reserve(longer.size());
  // Leading dimensions that exist only in the longer shape are taken as they are.
  for (size_t i = 0; i < lead; ++i) {
    broadcast_shape.push_back(longer[i]);
  }
  // Trailing dimensions shared by both shapes.
  for (size_t k = 0; k < length; ++k) {
    const size_t x_dim = x[x_len - length + k];
    const size_t y_dim = y[y_len - length + k];
    if (x_dim == 1) {
      broadcast_shape.push_back(y_dim);
    } else if (y_dim == 1 || x_dim == y_dim) {
      broadcast_shape.push_back(x_dim);
    }
  }
  return broadcast_shape;
}
|
||||
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,205 +1,205 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include "backend/kernel_compiler/kernel.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "backend/kernel_compiler/common_utils.h"
|
||||
#include "ir/anf.h"
|
||||
|
||||
using mindspore::kernel::Address;
|
||||
using mindspore::kernel::AddressPtr;
|
||||
using CTask = std::function<void(size_t, size_t)>;
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// String keys and flag characters shared by the CPU kernels.
// NOTE(review): the strings look like operator attribute names - confirm at the use sites.

// Convolution / pooling geometry.
constexpr char KERNEL_SIZE[] = "kernel_size";
constexpr char STRIDE[] = "stride";
constexpr char STRIDES[] = "strides";
constexpr char DILATION[] = "dilation";
constexpr char DILATIONS[] = "dilations";
constexpr char FORMAT[] = "format";

// Padding.
constexpr char PAD[] = "pad";
constexpr char PAD_LIST[] = "pad_list";
constexpr char PAD_MODE[] = "pad_mode";
constexpr char PAD_MODE_LOWER_SAME[] = "same";
constexpr char PAD_MODE_LOWER_VALID[] = "valid";
constexpr char PAD_MODE_UPPER_SAME[] = "SAME";
constexpr char PAD_MODE_UPPER_VALID[] = "VALID";

// Transposition.
constexpr char TRANSPOSE_A[] = "transpose_a";
constexpr char TRANSPOSE_B[] = "transpose_b";
constexpr char IS_GRAD[] = "is_grad";
constexpr char TRANSPOSE_NO = 'N';
constexpr char TRANSPOSE_YES = 'T';

// Miscellaneous.
constexpr char AXIS[] = "axis";
constexpr char DIM[] = "dim";
constexpr char BEGIN[] = "begin";
constexpr char END[] = "end";
constexpr char SIZE[] = "size";
constexpr char USE_NESTEROV[] = "use_nesterov";
constexpr char GROUP[] = "group";
constexpr char START[] = "start";
constexpr char LIMIT[] = "limit";
constexpr char DELTA[] = "delta";
constexpr char SORTED[] = "sorted";
constexpr char ADJ_ST[] = "adjoint_st";
constexpr char ADJ_dT[] = "adjoint_dt";
|
||||
|
||||
// Operation identifiers. Enumerators are numbered consecutively starting with
// ADD = 0, so the order below must stay as it is if the numeric values are relied on.
enum OperateType {
  // 0 - 7
  ADD = 0, SUB, MUL, DIV, SQUARE, SQRT, POW, REALDIV,
  // 8 - 15
  FLOORDIV, MOD, FLOORMOD, NEG, LESS, ASSIGNADD, RELUGRAD, RELU6GRAD,
  // 16 - 23
  ABSGRAD, TANHGRAD, SQRTGRAD, SIGMOIDGRAD, ONESLIKE, ZEROSLIKE, SIGN, EQUAL,
  // 24 - 31
  NOTEQUAL, LESSEQUAL, LOGICALAND, LOGICALOR, LOGICALNOT, FLOOR, SQUAREDDIFFERENCE, GREATER,
  // 32 - 39
  GREATEREQUAL, RECIPROCAL, GELU, GELUGRAD, ASIN, ACOS, ATAN, ASINGRAD,
  // 40 - 47
  ACOSGRAD, ATANGRAD, SIN, COS, TAN, SINH, COSH, ASINH,
  // 48 - 55
  ACOSH, ATANH, ASINHGRAD, ACOSHGRAD, ATAN2, RINT, ROUND, IDENTITY,
};
|
||||
|
||||
class CPUKernel : public kernel::KernelMod {
|
||||
public:
|
||||
CPUKernel() = default;
|
||||
~CPUKernel() override = default;
|
||||
virtual void Init(const CNodePtr &kernel_node);
|
||||
virtual void InitKernel(const CNodePtr &kernel_node) = 0;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override {
|
||||
return Launch(inputs, workspace, outputs);
|
||||
};
|
||||
virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) = 0;
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
|
||||
protected:
|
||||
virtual void InitInputOutputSize(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> input_size_list_;
|
||||
std::vector<size_t> output_size_list_;
|
||||
std::vector<size_t> workspace_size_list_;
|
||||
};
|
||||
|
||||
class CPUKernelUtils {
|
||||
public:
|
||||
static void ExpandDimsTo4(std::vector<size_t> *shape);
|
||||
static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
|
||||
static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
|
||||
static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
|
||||
static void ParallelFor(const CTask &task, size_t count);
|
||||
static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
|
||||
static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
|
||||
};
|
||||
|
||||
// Walks the elements of a broadcast output and tracks, for each of them, the
// flat position of the corresponding element in input A and input B.
class BroadcastIterator {
 public:
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;

  // Current flat position in input A / input B.
  size_t GetInputPosA() const { return input_pos_[0]; }
  size_t GetInputPosB() const { return input_pos_[1]; }

  // Moves to the output element with flat index `pos`.
  void SetPos(size_t pos);
  // Advances to the next output element.
  void GenNextPos();

 private:
  void BroadcastShape();
  void InitStrides();

  // Per-dimension state, indexed by output dimension.
  std::vector<size_t> coordinates_;
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  // Slot 0 holds the position in input A, slot 1 the position in input B.
  std::array<size_t, 2> input_pos_{0};
  int output_dimension_{0};
};
|
||||
|
||||
// Walks the elements of a transposed output and tracks the flat position of the
// corresponding element in the input.
class TransposeIterator {
 public:
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                    const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;

  // Current flat position in the input.
  size_t GetPos() const { return pos_; }

  // Moves to the output element with flat index `pos`.
  void SetPos(size_t pos);
  // Advances to the next output element.
  void GenNextPos();

 private:
  int dimension_{0};
  // Per-dimension state, indexed by output dimension.
  std::vector<size_t> coordinates_;
  std::vector<size_t> shape_;
  std::vector<size_t> strides_;
  std::vector<size_t> back_strides_;
  std::vector<size_t> axes_;
  size_t pos_{0};
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include "backend/kernel_compiler/kernel.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "backend/kernel_compiler/common_utils.h"
|
||||
#include "ir/anf.h"
|
||||
|
||||
using mindspore::kernel::Address;
|
||||
using mindspore::kernel::AddressPtr;
|
||||
using CTask = std::function<void(size_t, size_t)>;
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// String keys and flag characters shared by the CPU kernels.
// NOTE(review): the strings look like operator attribute names - confirm at the use sites.

// Convolution / pooling geometry.
constexpr char KERNEL_SIZE[] = "kernel_size";
constexpr char STRIDE[] = "stride";
constexpr char STRIDES[] = "strides";
constexpr char DILATION[] = "dilation";
constexpr char DILATIONS[] = "dilations";
constexpr char FORMAT[] = "format";

// Padding.
constexpr char PAD[] = "pad";
constexpr char PAD_LIST[] = "pad_list";
constexpr char PAD_MODE[] = "pad_mode";
constexpr char PAD_MODE_LOWER_SAME[] = "same";
constexpr char PAD_MODE_LOWER_VALID[] = "valid";
constexpr char PAD_MODE_UPPER_SAME[] = "SAME";
constexpr char PAD_MODE_UPPER_VALID[] = "VALID";

// Transposition.
constexpr char TRANSPOSE_A[] = "transpose_a";
constexpr char TRANSPOSE_B[] = "transpose_b";
constexpr char IS_GRAD[] = "is_grad";
constexpr char TRANSPOSE_NO = 'N';
constexpr char TRANSPOSE_YES = 'T';

// Miscellaneous.
constexpr char AXIS[] = "axis";
constexpr char DIM[] = "dim";
constexpr char BEGIN[] = "begin";
constexpr char END[] = "end";
constexpr char SIZE[] = "size";
constexpr char USE_NESTEROV[] = "use_nesterov";
constexpr char GROUP[] = "group";
constexpr char START[] = "start";
constexpr char LIMIT[] = "limit";
constexpr char DELTA[] = "delta";
constexpr char SORTED[] = "sorted";
constexpr char ADJ_ST[] = "adjoint_st";
constexpr char ADJ_dT[] = "adjoint_dt";
|
||||
|
||||
// Operation identifiers. Enumerators are numbered consecutively starting with
// ADD = 0, so the order below must stay as it is if the numeric values are relied on.
enum OperateType {
  // 0 - 7
  ADD = 0, SUB, MUL, DIV, SQUARE, SQRT, POW, REALDIV,
  // 8 - 15
  FLOORDIV, MOD, FLOORMOD, NEG, LESS, ASSIGNADD, RELUGRAD, RELU6GRAD,
  // 16 - 23
  ABSGRAD, TANHGRAD, SQRTGRAD, SIGMOIDGRAD, ONESLIKE, ZEROSLIKE, SIGN, EQUAL,
  // 24 - 31
  NOTEQUAL, LESSEQUAL, LOGICALAND, LOGICALOR, LOGICALNOT, FLOOR, SQUAREDDIFFERENCE, GREATER,
  // 32 - 39
  GREATEREQUAL, RECIPROCAL, GELU, GELUGRAD, ASIN, ACOS, ATAN, ASINGRAD,
  // 40 - 47
  ACOSGRAD, ATANGRAD, SIN, COS, TAN, SINH, COSH, ASINH,
  // 48 - 55
  ACOSH, ATANH, ASINHGRAD, ACOSHGRAD, ATAN2, RINT, ROUND, IDENTITY,
};
|
||||
|
||||
class CPUKernel : public kernel::KernelMod {
|
||||
public:
|
||||
CPUKernel() = default;
|
||||
~CPUKernel() override = default;
|
||||
virtual void Init(const CNodePtr &kernel_node);
|
||||
virtual void InitKernel(const CNodePtr &kernel_node) = 0;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override {
|
||||
return Launch(inputs, workspace, outputs);
|
||||
};
|
||||
virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) = 0;
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
|
||||
protected:
|
||||
virtual void InitInputOutputSize(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> input_size_list_;
|
||||
std::vector<size_t> output_size_list_;
|
||||
std::vector<size_t> workspace_size_list_;
|
||||
};
|
||||
|
||||
class CPUKernelUtils {
|
||||
public:
|
||||
static void ExpandDimsTo4(std::vector<size_t> *shape);
|
||||
static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
|
||||
static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
|
||||
static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
|
||||
static void ParallelFor(const CTask &task, size_t count);
|
||||
static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
|
||||
static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
|
||||
};
|
||||
|
||||
// Iterator that tracks a position in each of two inputs (A and B) for a given output
// shape. The stepping logic is defined out of line; this declaration only exposes the
// current positions and the SetPos()/GenNextPos() controls.
class BroadcastIterator {
 public:
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;

  // Current position for input A / input B.
  inline size_t GetInputPosA() const { return input_pos_[0]; }
  inline size_t GetInputPosB() const { return input_pos_[1]; }

  void SetPos(size_t pos);
  void GenNextPos();

 private:
  void BroadcastShape();
  void InitStrides();

  std::vector<size_t> coordinates_;
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  // Index 0 holds the position for input A, index 1 the position for input B.
  std::array<size_t, 2> input_pos_{0};
  int output_dimension_{0};
};
|
||||
|
||||
// Iterator exposing a single position that is driven through SetPos()/GenNextPos().
// It is constructed from an output shape, a list of axes and the input shape; how
// these are combined is defined out of line.
class TransposeIterator {
 public:
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;

  // Current position.
  inline size_t GetPos() const { return pos_; }

  void SetPos(size_t pos);
  void GenNextPos();

 private:
  int dimension_{0};
  std::vector<size_t> coordinates_;
  std::vector<size_t> shape_;
  std::vector<size_t> strides_;
  std::vector<size_t> back_strides_;
  std::vector<size_t> axes_;
  size_t pos_{0};
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,340 +1,340 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
CheckParam(kernel_node);
|
||||
probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
|
||||
if (probs_shape_.size() != 3) {
|
||||
MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " not support.";
|
||||
}
|
||||
if (labels_dims_.size() != 1) {
|
||||
MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support.";
|
||||
}
|
||||
if (indice_dims_.size() != 2) {
|
||||
MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support.";
|
||||
}
|
||||
|
||||
preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated");
|
||||
ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated");
|
||||
ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
|
||||
|
||||
max_time_ = probs_shape_[0];
|
||||
batch_size_ = probs_shape_[1];
|
||||
num_class_ = probs_shape_[2];
|
||||
blank_index_ = num_class_ - 1;
|
||||
}
|
||||
|
||||
// Dispatches to the typed LaunchKernel according to the cached input dtype.
// The workspace argument is unused. Always returns true; for any dtype other than
// float16 / float32 nothing is computed.
bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                              const std::vector<kernel::AddressPtr> &outputs) {
  if (dtype_ == kNumberTypeFloat16) {
    LaunchKernel<float16>(inputs, outputs);
    return true;
  }
  if (dtype_ == kNumberTypeFloat32) {
    LaunchKernel<float>(inputs, outputs);
  }
  return true;
}
|
||||
|
||||
// Returns log(exp(logprob1) + exp(logprob2)).
// A value of -infinity stands for log(0): if either argument is -infinity the other
// one is returned unchanged. Otherwise the larger argument is factored out so that
// exp() is only applied to a non-positive difference.
template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
  const T log_zero = -std::numeric_limits<T>::infinity();
  if (logprob1 <= log_zero) {
    return logprob2;
  }
  if (logprob2 <= log_zero) {
    return logprob1;
  }
  if (logprob1 > logprob2) {
    return logprob1 + static_cast<T>(log1p(exp(logprob2 - logprob1)));
  }
  return logprob2 + static_cast<T>(log1p(exp(logprob1 - logprob2)));
}
|
||||
|
||||
template <typename TT>
|
||||
void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
|
||||
const std::vector<std::vector<TT>> &y,
|
||||
std::vector<std::vector<TT>> *log_alpha_b) {
|
||||
int U = label_with_blank.size();
|
||||
int T = (*log_alpha_b)[0].size();
|
||||
TT kLogZero_ = -std::numeric_limits<TT>::infinity();
|
||||
|
||||
(*log_alpha_b)[0][0] = static_cast<TT>(log(y[blank_index_][0]));
|
||||
auto label_0 = (label_with_blank.size() > 1) ? label_with_blank[1] : blank_index_;
|
||||
if (label_with_blank.size() > 1) {
|
||||
(*log_alpha_b)[1][0] = static_cast<TT>(log(y[label_0][0]));
|
||||
}
|
||||
|
||||
for (int t = 1; t < T; ++t) {
|
||||
int low = std::max(0, U - (2 * (T - t)));
|
||||
int high = std::min(U, 2 * (t + 1));
|
||||
for (int u = low; u < high; ++u) {
|
||||
auto sum_log_alpha_b = kLogZero_;
|
||||
if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
|
||||
sum_log_alpha_b = (*log_alpha_b)[u][t - 1];
|
||||
}
|
||||
|
||||
if (u > 0) {
|
||||
sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]);
|
||||
}
|
||||
|
||||
if (u > 1) {
|
||||
bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]);
|
||||
if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
|
||||
sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
(*log_alpha_b)[u][t] =
|
||||
static_cast<TT>(log(static_cast<TT>(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TT>
|
||||
void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank,
|
||||
const std::vector<std::vector<TT>> &y,
|
||||
std::vector<std::vector<TT>> *log_beta_b) {
|
||||
int T = (*log_beta_b)[0].size();
|
||||
int U = label_with_blank.size();
|
||||
if (U > 1) {
|
||||
for (int u = U - 2; u < U; ++u) {
|
||||
(*log_beta_b)[u][T - 1] = TT(0);
|
||||
}
|
||||
} else {
|
||||
(*log_beta_b)[0][T - 1] = TT(0);
|
||||
(*log_beta_b)[0][T - 2] = TT(0);
|
||||
}
|
||||
|
||||
for (int t = T - 2; t >= 0; --t) {
|
||||
int low = std::max(0, U - (2 * (T - t)));
|
||||
int high = std::min(U, 2 * (t + 1));
|
||||
for (int u = low; u < high; ++u) {
|
||||
if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
|
||||
(*log_beta_b)[u][t] =
|
||||
LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1])));
|
||||
}
|
||||
|
||||
if (u + 1 < U) {
|
||||
(*log_beta_b)[u][t] =
|
||||
LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1])));
|
||||
}
|
||||
|
||||
if (u + 2 < U) {
|
||||
bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]);
|
||||
if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
|
||||
(*log_beta_b)[u][t] =
|
||||
LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1])));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TT>
|
||||
void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_blank,
|
||||
const std::vector<std::vector<TT>> &y,
|
||||
const std::vector<std::vector<TT>> &log_alpha_b,
|
||||
const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx,
|
||||
std::vector<std::vector<TT>> *dy) {
|
||||
auto dy_b = dy;
|
||||
TT kLogZero_ = -std::numeric_limits<TT>::infinity();
|
||||
if (log_pzx <= kLogZero_) {
|
||||
MS_LOG(INFO) << "No valid path found";
|
||||
return;
|
||||
}
|
||||
|
||||
size_t L = y.size();
|
||||
size_t T = y[0].size();
|
||||
size_t U = label_with_blank.size();
|
||||
|
||||
for (size_t t = 0; t < T; ++t) {
|
||||
std::vector<TT> prob_sum(L, kLogZero_);
|
||||
|
||||
for (size_t u = 0; u < U; ++u) {
|
||||
uint32_t l = label_with_blank[u];
|
||||
prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]);
|
||||
}
|
||||
for (size_t l = 0; l < L; ++l) {
|
||||
(*dy_b)[l][t] = y[l][t] - static_cast<TT>(exp(prob_sum[l] - log_pzx));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
|
||||
std::vector<std::vector<uint32_t>> *label_with_blank) {
|
||||
for (size_t b = 0; b < batch_size_; ++b) {
|
||||
std::vector<uint32_t> l;
|
||||
const std::vector<uint32_t> &label = batch_label[b];
|
||||
bool has_blank = false;
|
||||
for (size_t i = 0; i < label.size(); ++i) {
|
||||
if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) {
|
||||
if (label[i] >= num_class_ - 1) {
|
||||
has_blank = true;
|
||||
} else {
|
||||
if (has_blank) {
|
||||
MS_LOG(EXCEPTION) << "Invalid labels(index >= num_class - 1) should not appear between two valid labels";
|
||||
}
|
||||
l.push_back(label[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!ignore_longer_outputs_than_inputs_) {
|
||||
if (l.size() > seq_len[b]) {
|
||||
MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets "
|
||||
<< seq_len[b] << "< " << l.size();
|
||||
}
|
||||
}
|
||||
|
||||
(*label_with_blank)[b].reserve(2 * l.size() + 1);
|
||||
for (auto l_i : l) {
|
||||
(*label_with_blank)[b].push_back(blank_index_);
|
||||
(*label_with_blank)[b].push_back(l_i);
|
||||
}
|
||||
(*label_with_blank)[b].push_back(blank_index_);
|
||||
}
|
||||
}
|
||||
|
||||
// Computes a softmax over the class axis for batch element `b`, for every time step
// in [0, sequence_length).
//   inputs_addr:   logits, read at index t * batch_size * num_class + b * num_class + c.
//   softmax_probs: output, written as (*softmax_probs)[c][t]; must already be at least
//                  num_class x sequence_length.
// The row maximum is taken over the actual logits (not seeded with 0), so rows whose
// logits are all strongly negative no longer underflow to a zero sum and produce NaN.
template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
                  size_t num_class, size_t batch_size, size_t b) {
  if (num_class == 0) {
    return;
  }
  for (size_t t = 0; t < sequence_length; ++t) {
    const T *logits = inputs_addr + t * batch_size * num_class + b * num_class;

    T maxCoeff = logits[0];
    for (size_t c = 1; c < num_class; ++c) {
      if (logits[c] > maxCoeff) {
        maxCoeff = logits[c];
      }
    }

    // Shift by the maximum before exponentiating; each exp() is evaluated once.
    T sumCoeff(T(0));
    for (size_t c = 0; c < num_class; ++c) {
      const T shifted_exp = static_cast<T>(exp(logits[c] - maxCoeff));
      (*softmax_probs)[c][t] = shifted_exp;
      sumCoeff += shifted_exp;
    }

    for (size_t c = 0; c < num_class; ++c) {
      (*softmax_probs)[c][t] /= sumCoeff;
    }
  }
}
|
||||
|
||||
// Resizes *array2D to `row` rows of `col` elements each. Elements created by the
// resize are set to init_value; elements that already existed keep their value.
template <typename T>
void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
  array2D->resize(row);
  for (auto &line : *array2D) {
    line.resize(col, init_value);
  }
}
|
||||
|
||||
template <typename T>
|
||||
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
|
||||
auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
|
||||
auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
|
||||
auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
|
||||
auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
|
||||
|
||||
std::vector<std::vector<uint32_t>> label_batch;
|
||||
std::vector<std::vector<uint32_t>> labels_with_blank;
|
||||
std::vector<uint64_t> each_label_length;
|
||||
|
||||
label_batch.resize(batch_size_);
|
||||
labels_with_blank.resize(batch_size_);
|
||||
each_label_length.resize(batch_size_, 0);
|
||||
|
||||
T kLogZero_ = -std::numeric_limits<T>::infinity();
|
||||
// check validation of sequence length
|
||||
for (size_t b = 0; b < batch_size_; ++b) {
|
||||
if (sequence_length_addr[b] == uint32_t(0)) {
|
||||
MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b];
|
||||
}
|
||||
|
||||
if (sequence_length_addr[b] > max_time_) {
|
||||
MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < "
|
||||
<< sequence_length_addr[b];
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < indice_dims_[0]; ++i) {
|
||||
each_label_length[labels_indices_addr[i * 2]]++;
|
||||
}
|
||||
|
||||
// convert label format of label_value and label_indices to batch_label
|
||||
uint64_t cum_sum = 0;
|
||||
for (size_t b = 0; b < batch_size_; ++b) {
|
||||
std::vector<uint32_t> *b_value = &label_batch[b];
|
||||
for (size_t l = 0; l < each_label_length[b]; ++l) {
|
||||
b_value->push_back(labels_values_addr[cum_sum + l]);
|
||||
}
|
||||
cum_sum += each_label_length[b];
|
||||
}
|
||||
|
||||
// convert label to label with blank
|
||||
GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
|
||||
|
||||
for (size_t b = 0; b < batch_size_; ++b) {
|
||||
std::vector<uint32_t> label_with_blank = labels_with_blank[b];
|
||||
// y_b [num_class, sequence_length]
|
||||
std::vector<std::vector<T>> y_b;
|
||||
std::vector<std::vector<T>> dy;
|
||||
std::vector<std::vector<T>> log_alpha_b;
|
||||
std::vector<std::vector<T>> log_beta_b;
|
||||
MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
|
||||
MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
|
||||
MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
|
||||
MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
|
||||
InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);
|
||||
|
||||
CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
|
||||
CalculateBwdVar(label_with_blank, y_b, &log_beta_b);
|
||||
|
||||
T log_pzx = kLogZero_;
|
||||
for (size_t u = 0; u < label_with_blank.size(); ++u) {
|
||||
log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
|
||||
}
|
||||
|
||||
loss_addr[b] = -log_pzx;
|
||||
|
||||
CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);
|
||||
|
||||
for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
|
||||
for (size_t c = 0; c < num_class_; ++c) {
|
||||
gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 2) {
|
||||
MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num;
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
CheckParam(kernel_node);
|
||||
probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
|
||||
if (probs_shape_.size() != 3) {
|
||||
MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " not support.";
|
||||
}
|
||||
if (labels_dims_.size() != 1) {
|
||||
MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support.";
|
||||
}
|
||||
if (indice_dims_.size() != 2) {
|
||||
MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support.";
|
||||
}
|
||||
|
||||
preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated");
|
||||
ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated");
|
||||
ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
|
||||
|
||||
max_time_ = probs_shape_[0];
|
||||
batch_size_ = probs_shape_[1];
|
||||
num_class_ = probs_shape_[2];
|
||||
blank_index_ = num_class_ - 1;
|
||||
}
|
||||
|
||||
// Dispatches to the typed LaunchKernel according to the cached input dtype.
// The workspace argument is unused. Always returns true; for any dtype other than
// float16 / float32 nothing is computed.
bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                              const std::vector<kernel::AddressPtr> &outputs) {
  if (dtype_ == kNumberTypeFloat16) {
    LaunchKernel<float16>(inputs, outputs);
    return true;
  }
  if (dtype_ == kNumberTypeFloat32) {
    LaunchKernel<float>(inputs, outputs);
  }
  return true;
}
|
||||
|
||||
// Returns log(exp(logprob1) + exp(logprob2)).
// A value of -infinity stands for log(0): if either argument is -infinity the other
// one is returned unchanged. Otherwise the larger argument is factored out so that
// exp() is only applied to a non-positive difference.
template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
  const T log_zero = -std::numeric_limits<T>::infinity();
  if (logprob1 <= log_zero) {
    return logprob2;
  }
  if (logprob2 <= log_zero) {
    return logprob1;
  }
  if (logprob1 > logprob2) {
    return logprob1 + static_cast<T>(log1p(exp(logprob2 - logprob1)));
  }
  return logprob2 + static_cast<T>(log1p(exp(logprob1 - logprob2)));
}
|
||||
|
||||
template <typename TT>
|
||||
void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
|
||||
const std::vector<std::vector<TT>> &y,
|
||||
std::vector<std::vector<TT>> *log_alpha_b) {
|
||||
int U = label_with_blank.size();
|
||||
int T = (*log_alpha_b)[0].size();
|
||||
TT kLogZero_ = -std::numeric_limits<TT>::infinity();
|
||||
|
||||
(*log_alpha_b)[0][0] = static_cast<TT>(log(y[blank_index_][0]));
|
||||
auto label_0 = (label_with_blank.size() > 1) ? label_with_blank[1] : blank_index_;
|
||||
if (label_with_blank.size() > 1) {
|
||||
(*log_alpha_b)[1][0] = static_cast<TT>(log(y[label_0][0]));
|
||||
}
|
||||
|
||||
for (int t = 1; t < T; ++t) {
|
||||
int low = std::max(0, U - (2 * (T - t)));
|
||||
int high = std::min(U, 2 * (t + 1));
|
||||
for (int u = low; u < high; ++u) {
|
||||
auto sum_log_alpha_b = kLogZero_;
|
||||
if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
|
||||
sum_log_alpha_b = (*log_alpha_b)[u][t - 1];
|
||||
}
|
||||
|
||||
if (u > 0) {
|
||||
sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]);
|
||||
}
|
||||
|
||||
if (u > 1) {
|
||||
bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]);
|
||||
if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
|
||||
sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
(*log_alpha_b)[u][t] =
|
||||
static_cast<TT>(log(static_cast<TT>(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TT>
|
||||
void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank,
|
||||
const std::vector<std::vector<TT>> &y,
|
||||
std::vector<std::vector<TT>> *log_beta_b) {
|
||||
int T = (*log_beta_b)[0].size();
|
||||
int U = label_with_blank.size();
|
||||
if (U > 1) {
|
||||
for (int u = U - 2; u < U; ++u) {
|
||||
(*log_beta_b)[u][T - 1] = TT(0);
|
||||
}
|
||||
} else {
|
||||
(*log_beta_b)[0][T - 1] = TT(0);
|
||||
(*log_beta_b)[0][T - 2] = TT(0);
|
||||
}
|
||||
|
||||
for (int t = T - 2; t >= 0; --t) {
|
||||
int low = std::max(0, U - (2 * (T - t)));
|
||||
int high = std::min(U, 2 * (t + 1));
|
||||
for (int u = low; u < high; ++u) {
|
||||
if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
|
||||
(*log_beta_b)[u][t] =
|
||||
LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1])));
|
||||
}
|
||||
|
||||
if (u + 1 < U) {
|
||||
(*log_beta_b)[u][t] =
|
||||
LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1])));
|
||||
}
|
||||
|
||||
if (u + 2 < U) {
|
||||
bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]);
|
||||
if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
|
||||
(*log_beta_b)[u][t] =
|
||||
LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1])));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TT>
|
||||
void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_blank,
|
||||
const std::vector<std::vector<TT>> &y,
|
||||
const std::vector<std::vector<TT>> &log_alpha_b,
|
||||
const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx,
|
||||
std::vector<std::vector<TT>> *dy) {
|
||||
auto dy_b = dy;
|
||||
TT kLogZero_ = -std::numeric_limits<TT>::infinity();
|
||||
if (log_pzx <= kLogZero_) {
|
||||
MS_LOG(INFO) << "No valid path found";
|
||||
return;
|
||||
}
|
||||
|
||||
size_t L = y.size();
|
||||
size_t T = y[0].size();
|
||||
size_t U = label_with_blank.size();
|
||||
|
||||
for (size_t t = 0; t < T; ++t) {
|
||||
std::vector<TT> prob_sum(L, kLogZero_);
|
||||
|
||||
for (size_t u = 0; u < U; ++u) {
|
||||
uint32_t l = label_with_blank[u];
|
||||
prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]);
|
||||
}
|
||||
for (size_t l = 0; l < L; ++l) {
|
||||
(*dy_b)[l][t] = y[l][t] - static_cast<TT>(exp(prob_sum[l] - log_pzx));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
|
||||
std::vector<std::vector<uint32_t>> *label_with_blank) {
|
||||
for (size_t b = 0; b < batch_size_; ++b) {
|
||||
std::vector<uint32_t> l;
|
||||
const std::vector<uint32_t> &label = batch_label[b];
|
||||
bool has_blank = false;
|
||||
for (size_t i = 0; i < label.size(); ++i) {
|
||||
if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) {
|
||||
if (label[i] >= num_class_ - 1) {
|
||||
has_blank = true;
|
||||
} else {
|
||||
if (has_blank) {
|
||||
MS_LOG(EXCEPTION) << "Invalid labels(index >= num_class - 1) should not appear between two valid labels";
|
||||
}
|
||||
l.push_back(label[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!ignore_longer_outputs_than_inputs_) {
|
||||
if (l.size() > seq_len[b]) {
|
||||
MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets "
|
||||
<< seq_len[b] << "< " << l.size();
|
||||
}
|
||||
}
|
||||
|
||||
(*label_with_blank)[b].reserve(2 * l.size() + 1);
|
||||
for (auto l_i : l) {
|
||||
(*label_with_blank)[b].push_back(blank_index_);
|
||||
(*label_with_blank)[b].push_back(l_i);
|
||||
}
|
||||
(*label_with_blank)[b].push_back(blank_index_);
|
||||
}
|
||||
}
|
||||
|
||||
// Computes a softmax over the class axis for batch element `b`, for every time step
// in [0, sequence_length).
//   inputs_addr:   logits, read at index t * batch_size * num_class + b * num_class + c.
//   softmax_probs: output, written as (*softmax_probs)[c][t]; must already be at least
//                  num_class x sequence_length.
// The row maximum is taken over the actual logits (not seeded with 0), so rows whose
// logits are all strongly negative no longer underflow to a zero sum and produce NaN.
template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
                  size_t num_class, size_t batch_size, size_t b) {
  if (num_class == 0) {
    return;
  }
  for (size_t t = 0; t < sequence_length; ++t) {
    const T *logits = inputs_addr + t * batch_size * num_class + b * num_class;

    T maxCoeff = logits[0];
    for (size_t c = 1; c < num_class; ++c) {
      if (logits[c] > maxCoeff) {
        maxCoeff = logits[c];
      }
    }

    // Shift by the maximum before exponentiating; each exp() is evaluated once.
    T sumCoeff(T(0));
    for (size_t c = 0; c < num_class; ++c) {
      const T shifted_exp = static_cast<T>(exp(logits[c] - maxCoeff));
      (*softmax_probs)[c][t] = shifted_exp;
      sumCoeff += shifted_exp;
    }

    for (size_t c = 0; c < num_class; ++c) {
      (*softmax_probs)[c][t] /= sumCoeff;
    }
  }
}
|
||||
|
||||
// Resizes *array2D to `row` rows of `col` elements each. Elements created by the
// resize are set to init_value; elements that already existed keep their value.
template <typename T>
void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
  array2D->resize(row);
  for (auto &line : *array2D) {
    line.resize(col, init_value);
  }
}
|
||||
|
||||
template <typename T>
|
||||
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
|
||||
auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
|
||||
auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
|
||||
auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
|
||||
auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
|
||||
|
||||
std::vector<std::vector<uint32_t>> label_batch;
|
||||
std::vector<std::vector<uint32_t>> labels_with_blank;
|
||||
std::vector<uint64_t> each_label_length;
|
||||
|
||||
label_batch.resize(batch_size_);
|
||||
labels_with_blank.resize(batch_size_);
|
||||
each_label_length.resize(batch_size_, 0);
|
||||
|
||||
T kLogZero_ = -std::numeric_limits<T>::infinity();
|
||||
// check validation of sequence length
|
||||
for (size_t b = 0; b < batch_size_; ++b) {
|
||||
if (sequence_length_addr[b] == uint32_t(0)) {
|
||||
MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b];
|
||||
}
|
||||
|
||||
if (sequence_length_addr[b] > max_time_) {
|
||||
MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < "
|
||||
<< sequence_length_addr[b];
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < indice_dims_[0]; ++i) {
|
||||
each_label_length[labels_indices_addr[i * 2]]++;
|
||||
}
|
||||
|
||||
// convert label format of label_value and label_indices to batch_label
|
||||
uint64_t cum_sum = 0;
|
||||
for (size_t b = 0; b < batch_size_; ++b) {
|
||||
std::vector<uint32_t> *b_value = &label_batch[b];
|
||||
for (size_t l = 0; l < each_label_length[b]; ++l) {
|
||||
b_value->push_back(labels_values_addr[cum_sum + l]);
|
||||
}
|
||||
cum_sum += each_label_length[b];
|
||||
}
|
||||
|
||||
// convert label to label with blank
|
||||
GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
|
||||
|
||||
for (size_t b = 0; b < batch_size_; ++b) {
|
||||
std::vector<uint32_t> label_with_blank = labels_with_blank[b];
|
||||
// y_b [num_class, sequence_length]
|
||||
std::vector<std::vector<T>> y_b;
|
||||
std::vector<std::vector<T>> dy;
|
||||
std::vector<std::vector<T>> log_alpha_b;
|
||||
std::vector<std::vector<T>> log_beta_b;
|
||||
MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
|
||||
MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
|
||||
MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
|
||||
MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
|
||||
InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);
|
||||
|
||||
CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
|
||||
CalculateBwdVar(label_with_blank, y_b, &log_beta_b);
|
||||
|
||||
T log_pzx = kLogZero_;
|
||||
for (size_t u = 0; u < label_with_blank.size(); ++u) {
|
||||
log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
|
||||
}
|
||||
|
||||
loss_addr[b] = -log_pzx;
|
||||
|
||||
CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);
|
||||
|
||||
for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
|
||||
for (size_t c = 0; c < num_class_; ++c) {
|
||||
gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 2) {
|
||||
MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num;
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,92 +1,92 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class CTCLossCPUKernel : public CPUKernel {
|
||||
public:
|
||||
CTCLossCPUKernel() = default;
|
||||
~CTCLossCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
|
||||
std::vector<std::vector<uint32_t>> *label_with_blank);
|
||||
|
||||
template <typename T>
|
||||
void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
|
||||
std::vector<std::vector<T>> *log_alpha_b);
|
||||
template <typename T>
|
||||
void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
|
||||
std::vector<std::vector<T>> *log_beta_b);
|
||||
template <typename T>
|
||||
void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
|
||||
const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b,
|
||||
const T log_pzx, std::vector<std::vector<T>> *dy);
|
||||
|
||||
template <typename T>
|
||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> probs_shape_;
|
||||
std::vector<size_t> indice_dims_;
|
||||
std::vector<size_t> labels_dims_;
|
||||
size_t num_class_;
|
||||
size_t max_time_;
|
||||
size_t batch_size_;
|
||||
uint32_t blank_index_;
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
bool preprocess_collapse_repeated_;
|
||||
bool ctc_merge_repeated_;
|
||||
bool ignore_longer_outputs_than_inputs_;
|
||||
};
|
||||
|
||||
// CTCLoss registration, float16 variant: inputs are (float16, int64, int32, int32), both outputs are float16.
MS_REG_CPU_KERNEL(
  CTCLoss,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32)
    .AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  CTCLossCPUKernel);

// CTCLoss registration, float32 variant: inputs are (float32, int64, int32, int32), both outputs are float32.
MS_REG_CPU_KERNEL(
  CTCLoss,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32)
    .AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  CTCLossCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class CTCLossCPUKernel : public CPUKernel {
|
||||
public:
|
||||
CTCLossCPUKernel() = default;
|
||||
~CTCLossCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
|
||||
std::vector<std::vector<uint32_t>> *label_with_blank);
|
||||
|
||||
template <typename T>
|
||||
void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
|
||||
std::vector<std::vector<T>> *log_alpha_b);
|
||||
template <typename T>
|
||||
void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
|
||||
std::vector<std::vector<T>> *log_beta_b);
|
||||
template <typename T>
|
||||
void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
|
||||
const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b,
|
||||
const T log_pzx, std::vector<std::vector<T>> *dy);
|
||||
|
||||
template <typename T>
|
||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> probs_shape_;
|
||||
std::vector<size_t> indice_dims_;
|
||||
std::vector<size_t> labels_dims_;
|
||||
size_t num_class_;
|
||||
size_t max_time_;
|
||||
size_t batch_size_;
|
||||
uint32_t blank_index_;
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
bool preprocess_collapse_repeated_;
|
||||
bool ctc_merge_repeated_;
|
||||
bool ignore_longer_outputs_than_inputs_;
|
||||
};
|
||||
|
||||
// CTCLoss registration, float16 variant: inputs are (float16, int64, int32, int32), both outputs are float16.
MS_REG_CPU_KERNEL(
  CTCLoss,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32)
    .AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  CTCLossCPUKernel);

// CTCLoss registration, float32 variant: inputs are (float32, int64, int32, int32), both outputs are float32.
MS_REG_CPU_KERNEL(
  CTCLoss,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32)
    .AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  CTCLossCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,89 +1,89 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
CheckParam(kernel_node);
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> & /*workspace*/,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
size_t size = IntToSize(inputs[0]->size / sizeof(T));
|
||||
std::vector<size_t> input_shape = input_shape_;
|
||||
std::vector<size_t> output_shape = output_shape_;
|
||||
size_t block_size = block_size_;
|
||||
size_t input_dimension = input_shape.size();
|
||||
size_t output_strides[3] = {1, 1, 1};
|
||||
|
||||
for (size_t i = input_dimension - 1; i >= 1; --i) {
|
||||
for (size_t j = 0; j < i; ++j) {
|
||||
output_strides[j] *= output_shape[i];
|
||||
}
|
||||
}
|
||||
|
||||
auto task = [&, input_addr, output_addr](size_t start, size_t end) {
|
||||
std::vector<size_t> output_pos_array(input_dimension, 0);
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
size_t tmp_pos = i;
|
||||
for (size_t j = 0; j < input_dimension - 1; ++j) {
|
||||
output_pos_array[j] = tmp_pos / output_strides[j];
|
||||
tmp_pos %= output_strides[j];
|
||||
}
|
||||
output_pos_array.back() = tmp_pos;
|
||||
size_t input_pos = output_pos_array[0];
|
||||
input_pos =
|
||||
(input_pos * input_shape[1]) +
|
||||
(output_pos_array[1] +
|
||||
(block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]);
|
||||
input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size);
|
||||
input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size);
|
||||
output_addr[i] = input_addr[input_pos];
|
||||
}
|
||||
};
|
||||
|
||||
CPUKernelUtils::ParallelFor(task, size);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
CheckParam(kernel_node);
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> & /*workspace*/,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
size_t size = IntToSize(inputs[0]->size / sizeof(T));
|
||||
std::vector<size_t> input_shape = input_shape_;
|
||||
std::vector<size_t> output_shape = output_shape_;
|
||||
size_t block_size = block_size_;
|
||||
size_t input_dimension = input_shape.size();
|
||||
size_t output_strides[3] = {1, 1, 1};
|
||||
|
||||
for (size_t i = input_dimension - 1; i >= 1; --i) {
|
||||
for (size_t j = 0; j < i; ++j) {
|
||||
output_strides[j] *= output_shape[i];
|
||||
}
|
||||
}
|
||||
|
||||
auto task = [&, input_addr, output_addr](size_t start, size_t end) {
|
||||
std::vector<size_t> output_pos_array(input_dimension, 0);
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
size_t tmp_pos = i;
|
||||
for (size_t j = 0; j < input_dimension - 1; ++j) {
|
||||
output_pos_array[j] = tmp_pos / output_strides[j];
|
||||
tmp_pos %= output_strides[j];
|
||||
}
|
||||
output_pos_array.back() = tmp_pos;
|
||||
size_t input_pos = output_pos_array[0];
|
||||
input_pos =
|
||||
(input_pos * input_shape[1]) +
|
||||
(output_pos_array[1] +
|
||||
(block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]);
|
||||
input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size);
|
||||
input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size);
|
||||
output_addr[i] = input_addr[input_pos];
|
||||
}
|
||||
};
|
||||
|
||||
CPUKernelUtils::ParallelFor(task, size);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,85 +1,85 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
class DepthToSpaceCPUKernel : public CPUKernel {
|
||||
public:
|
||||
DepthToSpaceCPUKernel() = default;
|
||||
~DepthToSpaceCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> input_shape_;
|
||||
std::vector<size_t> output_shape_;
|
||||
size_t block_size_;
|
||||
};
|
||||
|
||||
// DepthToSpace registrations: one per element type. In every entry the input and output share the same type id
// and SetAllSameAttr(true) is set on the attribute.

// Floating-point element types.
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    DepthToSpaceCPUKernel, float);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeFloat16)
                      .AddOutputAttr(kNumberTypeFloat16),
                    DepthToSpaceCPUKernel, float16);

// Signed integer element types.
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeInt8)
                      .AddOutputAttr(kNumberTypeInt8),
                    DepthToSpaceCPUKernel, int8_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeInt16)
                      .AddOutputAttr(kNumberTypeInt16),
                    DepthToSpaceCPUKernel, int16_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeInt32),
                    DepthToSpaceCPUKernel, int);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeInt64),
                    DepthToSpaceCPUKernel, int64_t);

// Unsigned integer element types.
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeUInt8)
                      .AddOutputAttr(kNumberTypeUInt8),
                    DepthToSpaceCPUKernel, uint8_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeUInt16)
                      .AddOutputAttr(kNumberTypeUInt16),
                    DepthToSpaceCPUKernel, uint16_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeUInt32)
                      .AddOutputAttr(kNumberTypeUInt32),
                    DepthToSpaceCPUKernel, uint32_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeUInt64)
                      .AddOutputAttr(kNumberTypeUInt64),
                    DepthToSpaceCPUKernel, uint64_t);
|
||||
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
class DepthToSpaceCPUKernel : public CPUKernel {
|
||||
public:
|
||||
DepthToSpaceCPUKernel() = default;
|
||||
~DepthToSpaceCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> input_shape_;
|
||||
std::vector<size_t> output_shape_;
|
||||
size_t block_size_;
|
||||
};
|
||||
|
||||
// DepthToSpace registrations: one per element type. In every entry the input and output share the same type id
// and SetAllSameAttr(true) is set on the attribute.

// Floating-point element types.
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    DepthToSpaceCPUKernel, float);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeFloat16)
                      .AddOutputAttr(kNumberTypeFloat16),
                    DepthToSpaceCPUKernel, float16);

// Signed integer element types.
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeInt8)
                      .AddOutputAttr(kNumberTypeInt8),
                    DepthToSpaceCPUKernel, int8_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeInt16)
                      .AddOutputAttr(kNumberTypeInt16),
                    DepthToSpaceCPUKernel, int16_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeInt32),
                    DepthToSpaceCPUKernel, int);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeInt64),
                    DepthToSpaceCPUKernel, int64_t);

// Unsigned integer element types.
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeUInt8)
                      .AddOutputAttr(kNumberTypeUInt8),
                    DepthToSpaceCPUKernel, uint8_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeUInt16)
                      .AddOutputAttr(kNumberTypeUInt16),
                    DepthToSpaceCPUKernel, uint16_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeUInt32)
                      .AddOutputAttr(kNumberTypeUInt32),
                    DepthToSpaceCPUKernel, uint32_t);

MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr()
                      .SetAllSameAttr(true)
                      .AddInputAttr(kNumberTypeUInt64)
                      .AddOutputAttr(kNumberTypeUInt64),
                    DepthToSpaceCPUKernel, uint64_t);
|
||||
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,102 +1,102 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h"
|
||||
#include "backend/kernel_compiler/cpu/nnacl/errorcode.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "common/thread_pool.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) {
|
||||
int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start);
|
||||
if (ret != NNACL_OK) {
|
||||
MS_LOG(EXCEPTION) << "Add failed.";
|
||||
}
|
||||
}
|
||||
|
||||
void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
CheckParam(kernel_node);
|
||||
input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
|
||||
std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
|
||||
dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
|
||||
dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
|
||||
dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc);
|
||||
auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
|
||||
primitive_ = std::make_shared<dnnl::binary>(prim_desc);
|
||||
AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
|
||||
AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
|
||||
AddArgument(DNNL_ARG_DST, dst_mem_desc);
|
||||
}
|
||||
|
||||
// Sums all inputs into outputs[0]: the first two inputs are added first, then every further input is
// accumulated onto the output buffer in place.
// float32 goes through the dnnl primitive, int32 through AddInt under CPUKernelUtils::ParallelFor; any other
// dtype is reported via MS_LOG(EXCEPTION). The workspace argument is unused. Returns true.
bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  constexpr size_t kFirstExtraInput = 2;
  if (dtype_ == kNumberTypeFloat32) {
    // Binds the three primitive arguments and executes one binary add.
    const auto run_add = [this](void *lhs, void *rhs, void *dst) {
      SetArgumentHandle(DNNL_ARG_SRC_0, lhs);
      SetArgumentHandle(DNNL_ARG_SRC_1, rhs);
      SetArgumentHandle(DNNL_ARG_DST, dst);
      ExecutePrimitive();
    };
    run_add(inputs[0]->addr, inputs[1]->addr, outputs[0]->addr);
    for (size_t idx = kFirstExtraInput; idx < input_num_; ++idx) {
      run_add(outputs[0]->addr, inputs[idx]->addr, outputs[0]->addr);
    }
  } else if (dtype_ == kNumberTypeInt32) {
    const size_t count = outputs[0]->size / sizeof(int);
    const int *first = reinterpret_cast<int *>(inputs[0]->addr);
    const int *second = reinterpret_cast<int *>(inputs[1]->addr);
    int *sum = reinterpret_cast<int *>(outputs[0]->addr);

    auto add_first_pair = [first, second, sum](size_t begin, size_t end) {
      AddInt(first, second, sum, static_cast<int>(begin), static_cast<int>(end));
    };
    CPUKernelUtils::ParallelFor(add_first_pair, count);

    for (size_t idx = kFirstExtraInput; idx < input_num_; ++idx) {
      const int *extra = reinterpret_cast<int *>(inputs[idx]->addr);
      auto accumulate = [extra, sum](size_t begin, size_t end) {
        AddInt(extra, sum, sum, static_cast<int>(begin), static_cast<int>(end));
      };
      CPUKernelUtils::ParallelFor(accumulate, count);
    }
  } else {
    MS_LOG(EXCEPTION) << "AddN only support float32 and int32, but got " << TypeIdToType(dtype_)->ToString();
  }
  return true;
}
|
||||
|
||||
void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
if (src0_shape != dst_shape) {
|
||||
MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape.";
|
||||
}
|
||||
for (size_t index = 1; index < input_num_; ++index) {
|
||||
auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index);
|
||||
if (src0_shape != src_shape) {
|
||||
MS_LOG(EXCEPTION) << "AddN input shapes must be equal.";
|
||||
}
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output.";
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h"
|
||||
#include "backend/kernel_compiler/cpu/nnacl/errorcode.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "common/thread_pool.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) {
|
||||
int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start);
|
||||
if (ret != NNACL_OK) {
|
||||
MS_LOG(EXCEPTION) << "Add failed.";
|
||||
}
|
||||
}
|
||||
|
||||
void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
CheckParam(kernel_node);
|
||||
input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
|
||||
std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
|
||||
dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
|
||||
dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
|
||||
dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc);
|
||||
auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
|
||||
primitive_ = std::make_shared<dnnl::binary>(prim_desc);
|
||||
AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
|
||||
AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
|
||||
AddArgument(DNNL_ARG_DST, dst_mem_desc);
|
||||
}
|
||||
|
||||
// Sums all inputs into outputs[0]: the first two inputs are added first, then every further input is
// accumulated onto the output buffer in place.
// float32 goes through the dnnl primitive, int32 through AddInt under CPUKernelUtils::ParallelFor; any other
// dtype is reported via MS_LOG(EXCEPTION). The workspace argument is unused. Returns true.
bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  constexpr size_t kFirstExtraInput = 2;
  if (dtype_ == kNumberTypeFloat32) {
    // Binds the three primitive arguments and executes one binary add.
    const auto run_add = [this](void *lhs, void *rhs, void *dst) {
      SetArgumentHandle(DNNL_ARG_SRC_0, lhs);
      SetArgumentHandle(DNNL_ARG_SRC_1, rhs);
      SetArgumentHandle(DNNL_ARG_DST, dst);
      ExecutePrimitive();
    };
    run_add(inputs[0]->addr, inputs[1]->addr, outputs[0]->addr);
    for (size_t idx = kFirstExtraInput; idx < input_num_; ++idx) {
      run_add(outputs[0]->addr, inputs[idx]->addr, outputs[0]->addr);
    }
  } else if (dtype_ == kNumberTypeInt32) {
    const size_t count = outputs[0]->size / sizeof(int);
    const int *first = reinterpret_cast<int *>(inputs[0]->addr);
    const int *second = reinterpret_cast<int *>(inputs[1]->addr);
    int *sum = reinterpret_cast<int *>(outputs[0]->addr);

    auto add_first_pair = [first, second, sum](size_t begin, size_t end) {
      AddInt(first, second, sum, static_cast<int>(begin), static_cast<int>(end));
    };
    CPUKernelUtils::ParallelFor(add_first_pair, count);

    for (size_t idx = kFirstExtraInput; idx < input_num_; ++idx) {
      const int *extra = reinterpret_cast<int *>(inputs[idx]->addr);
      auto accumulate = [extra, sum](size_t begin, size_t end) {
        AddInt(extra, sum, sum, static_cast<int>(begin), static_cast<int>(end));
      };
      CPUKernelUtils::ParallelFor(accumulate, count);
    }
  } else {
    MS_LOG(EXCEPTION) << "AddN only support float32 and int32, but got " << TypeIdToType(dtype_)->ToString();
  }
  return true;
}
|
||||
|
||||
void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
if (src0_shape != dst_shape) {
|
||||
MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape.";
|
||||
}
|
||||
for (size_t index = 1; index < input_num_; ++index) {
|
||||
auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index);
|
||||
if (src0_shape != src_shape) {
|
||||
MS_LOG(EXCEPTION) << "AddN input shapes must be equal.";
|
||||
}
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output.";
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,51 +1,51 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class AddNCPUKernel : public MKLCPUKernel {
|
||||
public:
|
||||
AddNCPUKernel() = default;
|
||||
~AddNCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
size_t input_num_{0};
|
||||
std::vector<size_t> output_shape_;
|
||||
TypeId dtype_{kNumberTypeFloat32};
|
||||
};
|
||||
|
||||
// Register AddNCPUKernel for AddN with float32 inputs/outputs.
MS_REG_CPU_KERNEL(
  AddN, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  AddNCPUKernel);

// Register AddNCPUKernel for AddN with int32 inputs/outputs.
MS_REG_CPU_KERNEL(
  AddN, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  AddNCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class AddNCPUKernel : public MKLCPUKernel {
|
||||
public:
|
||||
AddNCPUKernel() = default;
|
||||
~AddNCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
size_t input_num_{0};
|
||||
std::vector<size_t> output_shape_;
|
||||
TypeId dtype_{kNumberTypeFloat32};
|
||||
};
|
||||
|
||||
// Register AddNCPUKernel for AddN with float32 inputs/outputs.
MS_REG_CPU_KERNEL(
  AddN, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  AddNCPUKernel);

// Register AddNCPUKernel for AddN with int32 inputs/outputs.
MS_REG_CPU_KERNEL(
  AddN, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  AddNCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,178 +1,178 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h"
|
||||
#include <string>
|
||||
#include "utils/ms_utils.h"
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Largest accepted value of the "num_layers" attribute.
constexpr int kMaxLSTMLayer = 100;
// Index of the output whose size/shape is overridden with the reserve (workspace) size.
constexpr int kOutputWorkSpaceIndex = 3;
|
||||
void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
|
||||
CPUKernel::InitInputOutputSize(kernel_node);
|
||||
output_size_list_[kOutputWorkSpaceIndex] = reserve_size_;
|
||||
auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
|
||||
auto output_types = std::vector<TypeId>(output_num, output_type);
|
||||
std::vector<std::vector<size_t>> output_shapes;
|
||||
for (size_t output_index = 0; output_index < output_num; ++output_index) {
|
||||
std::vector<size_t> shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index);
|
||||
output_shapes.emplace_back(shape);
|
||||
}
|
||||
size_t len = reserve_size_ / 4;
|
||||
output_shapes[kOutputWorkSpaceIndex] = {len, 1};
|
||||
AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get());
|
||||
}
|
||||
|
||||
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
#ifdef PLATFORM_86
|
||||
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
|
||||
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
|
||||
#endif
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
using tag = dnnl::memory::format_tag;
|
||||
using dim = dnnl::memory::dims;
|
||||
CheckParam(kernel_node);
|
||||
auto eng = MKLKernelEngine::Get().engine();
|
||||
dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
|
||||
if (bidirectional_) {
|
||||
direction = dnnl::rnn_direction::bidirectional_concat;
|
||||
}
|
||||
dim src_dims = {seq_len_, batch_size_, input_size_};
|
||||
dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
|
||||
weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
|
||||
bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
|
||||
dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
|
||||
dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
|
||||
dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
|
||||
dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
|
||||
dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
|
||||
dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
|
||||
dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
|
||||
dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
|
||||
if (!kernel_node->HasAttr(kAttrIsTraining)) {
|
||||
is_training = true;
|
||||
} else {
|
||||
is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining));
|
||||
}
|
||||
auto prop_kind = dnnl::prop_kind::forward_training;
|
||||
if (!is_training) {
|
||||
prop_kind = dnnl::prop_kind::forward_inference;
|
||||
}
|
||||
auto desc = std::make_shared<dnnl::lstm_forward::desc>(
|
||||
prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
|
||||
formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc);
|
||||
prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng);
|
||||
primitive_ = std::make_shared<dnnl::lstm_forward>(prim_desc_);
|
||||
if (is_training) {
|
||||
reserve_size_ = static_cast<size_t>(prim_desc_.workspace_desc().get_size());
|
||||
AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc());
|
||||
} else {
|
||||
reserve_size_ = 1;
|
||||
}
|
||||
AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
|
||||
AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
|
||||
AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
|
||||
AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc());
|
||||
AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc());
|
||||
AddArgument(DNNL_ARG_BIAS, bias_desc);
|
||||
AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
|
||||
AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
|
||||
AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
|
||||
}
|
||||
|
||||
void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
|
||||
std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
|
||||
bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
|
||||
input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"));
|
||||
hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"));
|
||||
num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"));
|
||||
has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias");
|
||||
batch_size_ = SizeToInt(src_shape[1]);
|
||||
seq_len_ = SizeToInt(src_shape[0]);
|
||||
num_directions_ = 1;
|
||||
if (bidirectional_) {
|
||||
num_directions_ = 2;
|
||||
}
|
||||
const int gate_size = 4 * hidden_size_;
|
||||
if (num_layers_ <= 0) {
|
||||
MS_LOG(EXCEPTION) << "Layers must be greater than zero!";
|
||||
}
|
||||
if (num_layers_ > kMaxLSTMLayer) {
|
||||
MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
|
||||
}
|
||||
for (int i = 0; i < num_layers_; ++i) {
|
||||
weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
|
||||
weight_h_size_ += gate_size * hidden_size_;
|
||||
}
|
||||
weight_size_ = weight_size_ * num_directions_;
|
||||
weight_h_size_ = weight_h_size_ * num_directions_;
|
||||
if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) {
|
||||
MS_LOG(EXCEPTION) << "Error iteration shape!";
|
||||
}
|
||||
if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) {
|
||||
MS_LOG(EXCEPTION) << "Lstm only support 3-D input!";
|
||||
}
|
||||
}
|
||||
|
||||
// Executes the LSTM forward primitive.
//
// inputs[0..2] are bound to the source layer / hidden state / cell state,
// inputs[3] is one float buffer holding the layer weights, then (at offset
// weight_size_) the recurrent weights, then (at offset weight_size_ +
// weight_h_size_) the bias when has_bias_ is set. outputs[0..2] receive the
// destination layer / hidden state / cell state; outputs[3] is bound as the
// primitive workspace in training mode. The workspace argument is unused.
bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();

  // Weights as laid out by the caller (ldgoi) ...
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  // ... and in the layout chosen by the primitive.
  auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng);

  float *const packed_params = reinterpret_cast<float *>(inputs[3]->addr);
  user_weights_memory.set_data_handle(packed_params);
  user_weights_h_memory.set_data_handle(packed_params + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);

  auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng);
  if (!has_bias_) {
    // No bias supplied: run with an all-zero bias buffer.
    const auto bias_bytes = prim_desc_.bias_desc().get_size();
    if (memset_s(bias_memory.get_data_handle(), bias_bytes, 0, bias_bytes) != 0) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  } else {
    bias_memory.set_data_handle(packed_params + weight_size_ + weight_h_size_);
  }

  // set handle
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr);
  if (is_training) {
    SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr);
  }
  ExecutePrimitive();
  return true;
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h"
|
||||
#include <string>
|
||||
#include "utils/ms_utils.h"
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Largest accepted value of the "num_layers" attribute.
constexpr int kMaxLSTMLayer = 100;
// Index of the output whose size/shape is overridden with the reserve (workspace) size.
constexpr int kOutputWorkSpaceIndex = 3;
|
||||
void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
|
||||
CPUKernel::InitInputOutputSize(kernel_node);
|
||||
output_size_list_[kOutputWorkSpaceIndex] = reserve_size_;
|
||||
auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
|
||||
auto output_types = std::vector<TypeId>(output_num, output_type);
|
||||
std::vector<std::vector<size_t>> output_shapes;
|
||||
for (size_t output_index = 0; output_index < output_num; ++output_index) {
|
||||
std::vector<size_t> shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index);
|
||||
output_shapes.emplace_back(shape);
|
||||
}
|
||||
size_t len = reserve_size_ / 4;
|
||||
output_shapes[kOutputWorkSpaceIndex] = {len, 1};
|
||||
AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get());
|
||||
}
|
||||
|
||||
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
#ifdef PLATFORM_86
|
||||
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
|
||||
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
|
||||
#endif
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
using tag = dnnl::memory::format_tag;
|
||||
using dim = dnnl::memory::dims;
|
||||
CheckParam(kernel_node);
|
||||
auto eng = MKLKernelEngine::Get().engine();
|
||||
dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
|
||||
if (bidirectional_) {
|
||||
direction = dnnl::rnn_direction::bidirectional_concat;
|
||||
}
|
||||
dim src_dims = {seq_len_, batch_size_, input_size_};
|
||||
dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
|
||||
weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
|
||||
bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
|
||||
dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
|
||||
dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
|
||||
dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
|
||||
dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
|
||||
dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
|
||||
dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
|
||||
dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
|
||||
dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
|
||||
if (!kernel_node->HasAttr(kAttrIsTraining)) {
|
||||
is_training = true;
|
||||
} else {
|
||||
is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining));
|
||||
}
|
||||
auto prop_kind = dnnl::prop_kind::forward_training;
|
||||
if (!is_training) {
|
||||
prop_kind = dnnl::prop_kind::forward_inference;
|
||||
}
|
||||
auto desc = std::make_shared<dnnl::lstm_forward::desc>(
|
||||
prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
|
||||
formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc);
|
||||
prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng);
|
||||
primitive_ = std::make_shared<dnnl::lstm_forward>(prim_desc_);
|
||||
if (is_training) {
|
||||
reserve_size_ = static_cast<size_t>(prim_desc_.workspace_desc().get_size());
|
||||
AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc());
|
||||
} else {
|
||||
reserve_size_ = 1;
|
||||
}
|
||||
AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
|
||||
AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
|
||||
AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
|
||||
AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc());
|
||||
AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc());
|
||||
AddArgument(DNNL_ARG_BIAS, bias_desc);
|
||||
AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
|
||||
AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
|
||||
AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
|
||||
}
|
||||
|
||||
void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
|
||||
std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
|
||||
bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
|
||||
input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"));
|
||||
hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"));
|
||||
num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"));
|
||||
has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias");
|
||||
batch_size_ = SizeToInt(src_shape[1]);
|
||||
seq_len_ = SizeToInt(src_shape[0]);
|
||||
num_directions_ = 1;
|
||||
if (bidirectional_) {
|
||||
num_directions_ = 2;
|
||||
}
|
||||
const int gate_size = 4 * hidden_size_;
|
||||
if (num_layers_ <= 0) {
|
||||
MS_LOG(EXCEPTION) << "Layers must be greater than zero!";
|
||||
}
|
||||
if (num_layers_ > kMaxLSTMLayer) {
|
||||
MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
|
||||
}
|
||||
for (int i = 0; i < num_layers_; ++i) {
|
||||
weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
|
||||
weight_h_size_ += gate_size * hidden_size_;
|
||||
}
|
||||
weight_size_ = weight_size_ * num_directions_;
|
||||
weight_h_size_ = weight_h_size_ * num_directions_;
|
||||
if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) {
|
||||
MS_LOG(EXCEPTION) << "Error iteration shape!";
|
||||
}
|
||||
if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) {
|
||||
MS_LOG(EXCEPTION) << "Lstm only support 3-D input!";
|
||||
}
|
||||
}
|
||||
|
||||
// Executes the LSTM forward primitive.
//
// inputs[0..2] are bound to the source layer / hidden state / cell state,
// inputs[3] is one float buffer holding the layer weights, then (at offset
// weight_size_) the recurrent weights, then (at offset weight_size_ +
// weight_h_size_) the bias when has_bias_ is set. outputs[0..2] receive the
// destination layer / hidden state / cell state; outputs[3] is bound as the
// primitive workspace in training mode. The workspace argument is unused.
bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();

  // Weights as laid out by the caller (ldgoi) ...
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  // ... and in the layout chosen by the primitive.
  auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng);

  float *const packed_params = reinterpret_cast<float *>(inputs[3]->addr);
  user_weights_memory.set_data_handle(packed_params);
  user_weights_h_memory.set_data_handle(packed_params + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);

  auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng);
  if (!has_bias_) {
    // No bias supplied: run with an all-zero bias buffer.
    const auto bias_bytes = prim_desc_.bias_desc().get_size();
    if (memset_s(bias_memory.get_data_handle(), bias_bytes, 0, bias_bytes) != 0) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  } else {
    bias_memory.set_data_handle(packed_params + weight_size_ + weight_h_size_);
  }

  // set handle
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr);
  if (is_training) {
    SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr);
  }
  ExecutePrimitive();
  return true;
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,76 +1,76 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
|
||||
#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64)
|
||||
#define PLATFORM_86
|
||||
#endif
|
||||
#ifdef PLATFORM_86
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class LstmCPUKernel : public MKLCPUKernel {
|
||||
public:
|
||||
LstmCPUKernel() = default;
|
||||
~LstmCPUKernel() override = default;
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
protected:
|
||||
void InitInputOutputSize(const CNodePtr &kernel_node) override;
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
int weight_size_ = 0;
|
||||
int weight_h_size_ = 0;
|
||||
int input_size_;
|
||||
int hidden_size_;
|
||||
int num_layers_;
|
||||
int batch_size_;
|
||||
int seq_len_;
|
||||
int num_directions_;
|
||||
bool bidirectional_;
|
||||
bool has_bias_;
|
||||
size_t reserve_size_;
|
||||
bool is_training;
|
||||
dnnl::memory::dims weights_dims_;
|
||||
dnnl::memory::dims weights_h_dims_;
|
||||
dnnl::memory::dims bias_dims_;
|
||||
dnnl::lstm_forward::primitive_desc prim_desc_;
|
||||
};
|
||||
|
||||
// Register LstmCPUKernel for LSTM: four float32 inputs and five float32 outputs.
MS_REG_CPU_KERNEL(LSTM,
                  KernelAttr()
                    // inputs
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    // outputs
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LstmCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
|
||||
#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64)
|
||||
#define PLATFORM_86
|
||||
#endif
|
||||
#ifdef PLATFORM_86
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class LstmCPUKernel : public MKLCPUKernel {
|
||||
public:
|
||||
LstmCPUKernel() = default;
|
||||
~LstmCPUKernel() override = default;
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
protected:
|
||||
void InitInputOutputSize(const CNodePtr &kernel_node) override;
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
int weight_size_ = 0;
|
||||
int weight_h_size_ = 0;
|
||||
int input_size_;
|
||||
int hidden_size_;
|
||||
int num_layers_;
|
||||
int batch_size_;
|
||||
int seq_len_;
|
||||
int num_directions_;
|
||||
bool bidirectional_;
|
||||
bool has_bias_;
|
||||
size_t reserve_size_;
|
||||
bool is_training;
|
||||
dnnl::memory::dims weights_dims_;
|
||||
dnnl::memory::dims weights_h_dims_;
|
||||
dnnl::memory::dims bias_dims_;
|
||||
dnnl::lstm_forward::primitive_desc prim_desc_;
|
||||
};
|
||||
|
||||
// Register LstmCPUKernel for LSTM: four float32 inputs and five float32 outputs.
MS_REG_CPU_KERNEL(LSTM,
                  KernelAttr()
                    // inputs
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    // outputs
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LstmCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,218 +1,218 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h"
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include "utils/ms_utils.h"
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Largest accepted number of LSTM layers.
constexpr int kMaxLSTMLayer = 100;
// Index of the input whose size is overridden with the reserve (workspace) size.
constexpr int kInputWorkSpaceIndex = 10;
|
||||
// Computes the default input/output sizes, then replaces the size recorded for
// input kInputWorkSpaceIndex with reserve_size_.
void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  CPUKernel::InitInputOutputSize(kernel_node);
  auto &workspace_input_bytes = input_size_list_[kInputWorkSpaceIndex];
  workspace_input_bytes = reserve_size_;
}
|
||||
|
||||
void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
using tag = dnnl::memory::format_tag;
|
||||
using dim = dnnl::memory::dims;
|
||||
CheckParam(kernel_node);
|
||||
auto eng = MKLKernelEngine::Get().engine();
|
||||
dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
|
||||
if (bidirectional_) {
|
||||
direction = dnnl::rnn_direction::bidirectional_concat;
|
||||
}
|
||||
dim src_dims = {seq_len_, batch_size_, input_size_};
|
||||
dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
|
||||
weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
|
||||
bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
|
||||
dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
|
||||
dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
|
||||
dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
|
||||
dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
|
||||
dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
|
||||
dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
|
||||
dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
|
||||
dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
|
||||
auto forward_desc = std::make_shared<dnnl::lstm_forward::desc>(
|
||||
dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
|
||||
formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc,
|
||||
dst_c_desc);
|
||||
auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng);
|
||||
auto backward_desc = std::make_shared<dnnl::lstm_backward::desc>(
|
||||
dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
|
||||
formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
|
||||
src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc,
|
||||
dst_h_desc, dst_c_desc);
|
||||
prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc);
|
||||
primitive_ = std::make_shared<dnnl::lstm_backward>(prim_backward_desc_);
|
||||
reserve_size_ = static_cast<size_t>(prim_forward_desc.workspace_desc().get_size());
|
||||
AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc());
|
||||
AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
|
||||
}
|
||||
|
||||
// Registers the memory descriptor of every tensor argument of the backward
// primitive. Weight descriptors come from prim_backward_desc_; every gradient
// reuses the descriptor of the tensor it belongs to, except the weight
// gradients, which use the diff_* descriptors of prim_backward_desc_.
void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                                      const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                                      const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                                      const dnnl::memory::desc &dst_c_desc) {
  const dnnl::memory::desc weights_desc = prim_backward_desc_.weights_layer_desc();
  const dnnl::memory::desc weights_h_desc = prim_backward_desc_.weights_iter_desc();
  const dnnl::memory::desc diff_weights_desc = prim_backward_desc_.diff_weights_layer_desc();
  const dnnl::memory::desc diff_weights_h_desc = prim_backward_desc_.diff_weights_iter_desc();

  struct ArgBinding {
    int key;
    const dnnl::memory::desc *desc;
  };
  // Kept in the original registration order.
  const ArgBinding bindings[] = {
    {DNNL_ARG_SRC_LAYER, &src_desc},
    {DNNL_ARG_SRC_ITER, &src_h_desc},
    {DNNL_ARG_SRC_ITER_C, &src_c_desc},
    {DNNL_ARG_WEIGHTS_LAYER, &weights_desc},
    {DNNL_ARG_WEIGHTS_ITER, &weights_h_desc},
    {DNNL_ARG_BIAS, &bias_desc},
    {DNNL_ARG_DST_LAYER, &dst_desc},
    {DNNL_ARG_DST_ITER, &dst_h_desc},
    {DNNL_ARG_DST_ITER_C, &dst_c_desc},
    {DNNL_ARG_DIFF_SRC_LAYER, &src_desc},
    {DNNL_ARG_DIFF_SRC_ITER, &src_h_desc},
    {DNNL_ARG_DIFF_SRC_ITER_C, &src_c_desc},
    {DNNL_ARG_DIFF_WEIGHTS_LAYER, &diff_weights_desc},
    {DNNL_ARG_DIFF_WEIGHTS_ITER, &diff_weights_h_desc},
    {DNNL_ARG_DIFF_BIAS, &bias_desc},
    {DNNL_ARG_DIFF_DST_LAYER, &dst_desc},
    {DNNL_ARG_DIFF_DST_ITER, &dst_h_desc},
    {DNNL_ARG_DIFF_DST_ITER_C, &dst_c_desc},
  };
  for (const auto &binding : bindings) {
    AddArgument(binding.key, *binding.desc);
  }
}
|
||||
|
||||
void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
|
||||
std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
|
||||
bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
|
||||
input_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size");
|
||||
hidden_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size");
|
||||
num_layers_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers");
|
||||
has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias");
|
||||
batch_size_ = SizeToInt(src_shape[1]);
|
||||
seq_len_ = SizeToInt(src_shape[0]);
|
||||
num_directions_ = 1;
|
||||
if (bidirectional_) {
|
||||
num_directions_ = 2;
|
||||
}
|
||||
const int64_t gate_size = 4 * hidden_size_;
|
||||
if (num_layers_ <= 0) {
|
||||
MS_LOG(EXCEPTION) << "Layers must be greater than zero!";
|
||||
}
|
||||
if (num_layers_ > kMaxLSTMLayer) {
|
||||
MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
|
||||
}
|
||||
for (int64_t i = 0; i < num_layers_; ++i) {
|
||||
weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
|
||||
weight_h_size_ += gate_size * hidden_size_;
|
||||
}
|
||||
weight_size_ = weight_size_ * num_directions_;
|
||||
weight_h_size_ = weight_h_size_ * num_directions_;
|
||||
if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) {
|
||||
MS_LOG(EXCEPTION) << "Error iteration shape!";
|
||||
}
|
||||
if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) {
|
||||
MS_LOG(EXCEPTION) << "Lstm only support 3-D input!";
|
||||
}
|
||||
}
|
||||
|
||||
// Binds a data pointer to every argument of the backward primitive. Forward
// tensors, incoming gradients and the workspace come from `inputs`; the
// gradients of the source tensors go to `outputs`; weights, bias and their
// gradients use the buffers of the dnnl::memory objects prepared by the caller.
void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                                            const std::vector<kernel::AddressPtr> &outputs,
                                            const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory,
                                            const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory,
                                            const dnnl::memory &diff_weights_h_memory,
                                            const dnnl::memory &diff_bias_memory) {
  struct HandleBinding {
    int key;
    void *handle;
  };
  // Kept in the original binding order. inputs[3] (the packed user weights) is
  // not bound here; the caller reorders it into weights_memory / weights_h_memory.
  const HandleBinding bindings[] = {
    {DNNL_ARG_SRC_LAYER, inputs[0]->addr},
    {DNNL_ARG_SRC_ITER, inputs[1]->addr},
    {DNNL_ARG_SRC_ITER_C, inputs[2]->addr},
    {DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()},
    {DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()},
    {DNNL_ARG_BIAS, bias_memory.get_data_handle()},
    {DNNL_ARG_DST_LAYER, inputs[4]->addr},
    {DNNL_ARG_DST_ITER, inputs[5]->addr},
    {DNNL_ARG_DST_ITER_C, inputs[6]->addr},
    {DNNL_ARG_WORKSPACE, inputs[kInputWorkSpaceIndex]->addr},
    {DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr},
    {DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr},
    {DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr},
    {DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()},
    {DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()},
    {DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()},
    {DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr},
    {DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr},
    {DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr},
  };
  for (const auto &binding : bindings) {
    SetArgumentHandle(binding.key, binding.handle);
  }
}
|
||||
|
||||
// Zero-fills the whole buffer behind `mem` (size taken from its descriptor).
// `name` only labels the exception raised when memset_s reports a failure.
void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const {
  const auto byte_count = mem.get_desc().get_size();
  const auto status = memset_s(mem.get_data_handle(), byte_count, 0, byte_count);
  if (status != 0) {
    MS_LOG(EXCEPTION) << name << " memset error";
  }
}
|
||||
|
||||
// Executes the LSTM backward primitive.
//
// inputs[3] and outputs[3] are packed float buffers: the layer weights come
// first, the recurrent weights start weight_size_ floats later and, when
// has_bias_ is set, the bias follows after another weight_h_size_ floats.
// The user weights (ldgoi layout) are reordered into the layout chosen by the
// primitive before execution, and the weight gradients are reordered back into
// outputs[3] afterwards.
//
// Returns true; raises (via MS_LOG(EXCEPTION)) when too few inputs/outputs are
// supplied or when clearing a buffer fails.
bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                               const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  constexpr size_t kPackedWeightIndex = 3;
  constexpr size_t kOutputNum = 4;
  // Every slot up to the workspace input and outputs[0..3] are dereferenced
  // below, so reject short argument lists before touching them.
  const size_t required_inputs = static_cast<size_t>(kInputWorkSpaceIndex) + 1;
  if (inputs.size() < required_inputs || outputs.size() < kOutputNum) {
    MS_LOG(EXCEPTION) << "LSTMGrad requires at least " << required_inputs << " inputs and " << kOutputNum
                      << " outputs, but got " << inputs.size() << " inputs and " << outputs.size() << " outputs";
  }
  auto eng = MKLKernelEngine::Get().engine();
  auto *packed_weights = reinterpret_cast<float *>(inputs[kPackedWeightIndex]->addr);
  auto *packed_diff_weights = reinterpret_cast<float *>(outputs[kPackedWeightIndex]->addr);
  const dnnl::memory::desc user_weights_desc(weights_dims_, dt::f32, tag::ldgoi);
  const dnnl::memory::desc user_weights_h_desc(weights_h_dims_, dt::f32, tag::ldgoi);

  // construct fw memory
  // The user-side memories wrap the caller's buffers directly, which avoids
  // allocating a weight-sized buffer only to replace its handle right away.
  auto user_weights_memory = dnnl::memory(user_weights_desc, eng, packed_weights);
  auto user_weights_h_memory = dnnl::memory(user_weights_h_desc, eng, packed_weights + weight_size_);
  auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng);
  auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  if (has_bias_) {
    bias_memory.set_data_handle(packed_weights + weight_size_ + weight_h_size_);
  } else {
    // No bias supplied: feed the primitive an all-zero bias.
    ResetMemory(bias_memory, "Bias");
  }

  // construct bw memory
  auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng);
  auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng);
  auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng);
  auto user_diff_weights_memory = dnnl::memory(user_weights_desc, eng, packed_diff_weights);
  auto user_diff_weights_h_memory = dnnl::memory(user_weights_h_desc, eng, packed_diff_weights + weight_size_);
  ResetMemory(user_diff_weights_memory, "user weights grad");
  ResetMemory(user_diff_weights_h_memory, "user weights iter grad");
  ResetMemory(diff_weights_memory, "weights grad");
  ResetMemory(diff_weights_h_memory, "weights iter grad");
  if (has_bias_) {
    // Write the bias gradient straight into the packed output buffer.
    diff_bias_memory.set_data_handle(packed_diff_weights + weight_size_ + weight_h_size_);
  }
  ResetMemory(diff_bias_memory, "Bias grad");

  SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory,
                      diff_weights_h_memory, diff_bias_memory);
  ExecutePrimitive();
  Reorder(&diff_weights_memory, &user_diff_weights_memory);
  Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory);
  return true;
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h"
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include "utils/ms_utils.h"
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Largest value accepted for the "num_layers" attribute (checked in CheckParam).
constexpr int kMaxLSTMLayer = 100;
// Position of the workspace buffer inside the kernel's input list.
constexpr int kInputWorkSpaceIndex = 10;
|
||||
void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
|
||||
CPUKernel::InitInputOutputSize(kernel_node);
|
||||
input_size_list_[kInputWorkSpaceIndex] = reserve_size_;
|
||||
}
|
||||
|
||||
void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
using tag = dnnl::memory::format_tag;
|
||||
using dim = dnnl::memory::dims;
|
||||
CheckParam(kernel_node);
|
||||
auto eng = MKLKernelEngine::Get().engine();
|
||||
dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
|
||||
if (bidirectional_) {
|
||||
direction = dnnl::rnn_direction::bidirectional_concat;
|
||||
}
|
||||
dim src_dims = {seq_len_, batch_size_, input_size_};
|
||||
dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
|
||||
weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
|
||||
bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
|
||||
dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
|
||||
dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
|
||||
dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
|
||||
dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
|
||||
dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
|
||||
dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
|
||||
dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
|
||||
dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
|
||||
dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
|
||||
auto forward_desc = std::make_shared<dnnl::lstm_forward::desc>(
|
||||
dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
|
||||
formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc,
|
||||
dst_c_desc);
|
||||
auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng);
|
||||
auto backward_desc = std::make_shared<dnnl::lstm_backward::desc>(
|
||||
dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
|
||||
formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
|
||||
src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc,
|
||||
dst_h_desc, dst_c_desc);
|
||||
prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc);
|
||||
primitive_ = std::make_shared<dnnl::lstm_backward>(prim_backward_desc_);
|
||||
reserve_size_ = static_cast<size_t>(prim_forward_desc.workspace_desc().get_size());
|
||||
AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc());
|
||||
AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
|
||||
}
|
||||
|
||||
// Registers the memory descriptor of every tensor argument of the backward
// primitive. Weight descriptors come from prim_backward_desc_; every gradient
// reuses the descriptor of the tensor it belongs to, except the weight
// gradients, which use the diff_* descriptors of prim_backward_desc_.
void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                                      const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                                      const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                                      const dnnl::memory::desc &dst_c_desc) {
  const dnnl::memory::desc weights_desc = prim_backward_desc_.weights_layer_desc();
  const dnnl::memory::desc weights_h_desc = prim_backward_desc_.weights_iter_desc();
  const dnnl::memory::desc diff_weights_desc = prim_backward_desc_.diff_weights_layer_desc();
  const dnnl::memory::desc diff_weights_h_desc = prim_backward_desc_.diff_weights_iter_desc();

  struct ArgBinding {
    int key;
    const dnnl::memory::desc *desc;
  };
  // Kept in the original registration order.
  const ArgBinding bindings[] = {
    {DNNL_ARG_SRC_LAYER, &src_desc},
    {DNNL_ARG_SRC_ITER, &src_h_desc},
    {DNNL_ARG_SRC_ITER_C, &src_c_desc},
    {DNNL_ARG_WEIGHTS_LAYER, &weights_desc},
    {DNNL_ARG_WEIGHTS_ITER, &weights_h_desc},
    {DNNL_ARG_BIAS, &bias_desc},
    {DNNL_ARG_DST_LAYER, &dst_desc},
    {DNNL_ARG_DST_ITER, &dst_h_desc},
    {DNNL_ARG_DST_ITER_C, &dst_c_desc},
    {DNNL_ARG_DIFF_SRC_LAYER, &src_desc},
    {DNNL_ARG_DIFF_SRC_ITER, &src_h_desc},
    {DNNL_ARG_DIFF_SRC_ITER_C, &src_c_desc},
    {DNNL_ARG_DIFF_WEIGHTS_LAYER, &diff_weights_desc},
    {DNNL_ARG_DIFF_WEIGHTS_ITER, &diff_weights_h_desc},
    {DNNL_ARG_DIFF_BIAS, &bias_desc},
    {DNNL_ARG_DIFF_DST_LAYER, &dst_desc},
    {DNNL_ARG_DIFF_DST_ITER, &dst_h_desc},
    {DNNL_ARG_DIFF_DST_ITER_C, &dst_c_desc},
  };
  for (const auto &binding : bindings) {
    AddArgument(binding.key, *binding.desc);
  }
}
|
||||
|
||||
void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
|
||||
std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
|
||||
bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
|
||||
input_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size");
|
||||
hidden_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size");
|
||||
num_layers_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers");
|
||||
has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias");
|
||||
batch_size_ = SizeToInt(src_shape[1]);
|
||||
seq_len_ = SizeToInt(src_shape[0]);
|
||||
num_directions_ = 1;
|
||||
if (bidirectional_) {
|
||||
num_directions_ = 2;
|
||||
}
|
||||
const int64_t gate_size = 4 * hidden_size_;
|
||||
if (num_layers_ <= 0) {
|
||||
MS_LOG(EXCEPTION) << "Layers must be greater than zero!";
|
||||
}
|
||||
if (num_layers_ > kMaxLSTMLayer) {
|
||||
MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
|
||||
}
|
||||
for (int64_t i = 0; i < num_layers_; ++i) {
|
||||
weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
|
||||
weight_h_size_ += gate_size * hidden_size_;
|
||||
}
|
||||
weight_size_ = weight_size_ * num_directions_;
|
||||
weight_h_size_ = weight_h_size_ * num_directions_;
|
||||
if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) {
|
||||
MS_LOG(EXCEPTION) << "Error iteration shape!";
|
||||
}
|
||||
if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) {
|
||||
MS_LOG(EXCEPTION) << "Lstm only support 3-D input!";
|
||||
}
|
||||
}
|
||||
|
||||
// Binds a data pointer to every argument of the backward primitive. Forward
// tensors, incoming gradients and the workspace come from `inputs`; the
// gradients of the source tensors go to `outputs`; weights, bias and their
// gradients use the buffers of the dnnl::memory objects prepared by the caller.
void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                                            const std::vector<kernel::AddressPtr> &outputs,
                                            const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory,
                                            const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory,
                                            const dnnl::memory &diff_weights_h_memory,
                                            const dnnl::memory &diff_bias_memory) {
  struct HandleBinding {
    int key;
    void *handle;
  };
  // Kept in the original binding order. inputs[3] (the packed user weights) is
  // not bound here; the caller reorders it into weights_memory / weights_h_memory.
  const HandleBinding bindings[] = {
    {DNNL_ARG_SRC_LAYER, inputs[0]->addr},
    {DNNL_ARG_SRC_ITER, inputs[1]->addr},
    {DNNL_ARG_SRC_ITER_C, inputs[2]->addr},
    {DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()},
    {DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()},
    {DNNL_ARG_BIAS, bias_memory.get_data_handle()},
    {DNNL_ARG_DST_LAYER, inputs[4]->addr},
    {DNNL_ARG_DST_ITER, inputs[5]->addr},
    {DNNL_ARG_DST_ITER_C, inputs[6]->addr},
    {DNNL_ARG_WORKSPACE, inputs[kInputWorkSpaceIndex]->addr},
    {DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr},
    {DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr},
    {DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr},
    {DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()},
    {DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()},
    {DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()},
    {DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr},
    {DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr},
    {DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr},
  };
  for (const auto &binding : bindings) {
    SetArgumentHandle(binding.key, binding.handle);
  }
}
|
||||
|
||||
// Zero-fills the whole buffer behind `mem` (size taken from its descriptor).
// `name` only labels the exception raised when memset_s reports a failure.
void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const {
  const auto byte_count = mem.get_desc().get_size();
  const auto status = memset_s(mem.get_data_handle(), byte_count, 0, byte_count);
  if (status != 0) {
    MS_LOG(EXCEPTION) << name << " memset error";
  }
}
|
||||
|
||||
// Executes the LSTM backward primitive.
//
// inputs[3] and outputs[3] are packed float buffers: the layer weights come
// first, the recurrent weights start weight_size_ floats later and, when
// has_bias_ is set, the bias follows after another weight_h_size_ floats.
// The user weights (ldgoi layout) are reordered into the layout chosen by the
// primitive before execution, and the weight gradients are reordered back into
// outputs[3] afterwards.
//
// Returns true; raises (via MS_LOG(EXCEPTION)) when too few inputs/outputs are
// supplied or when clearing a buffer fails.
bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                               const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  constexpr size_t kPackedWeightIndex = 3;
  constexpr size_t kOutputNum = 4;
  // Every slot up to the workspace input and outputs[0..3] are dereferenced
  // below, so reject short argument lists before touching them.
  const size_t required_inputs = static_cast<size_t>(kInputWorkSpaceIndex) + 1;
  if (inputs.size() < required_inputs || outputs.size() < kOutputNum) {
    MS_LOG(EXCEPTION) << "LSTMGrad requires at least " << required_inputs << " inputs and " << kOutputNum
                      << " outputs, but got " << inputs.size() << " inputs and " << outputs.size() << " outputs";
  }
  auto eng = MKLKernelEngine::Get().engine();
  auto *packed_weights = reinterpret_cast<float *>(inputs[kPackedWeightIndex]->addr);
  auto *packed_diff_weights = reinterpret_cast<float *>(outputs[kPackedWeightIndex]->addr);
  const dnnl::memory::desc user_weights_desc(weights_dims_, dt::f32, tag::ldgoi);
  const dnnl::memory::desc user_weights_h_desc(weights_h_dims_, dt::f32, tag::ldgoi);

  // construct fw memory
  // The user-side memories wrap the caller's buffers directly, which avoids
  // allocating a weight-sized buffer only to replace its handle right away.
  auto user_weights_memory = dnnl::memory(user_weights_desc, eng, packed_weights);
  auto user_weights_h_memory = dnnl::memory(user_weights_h_desc, eng, packed_weights + weight_size_);
  auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng);
  auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  if (has_bias_) {
    bias_memory.set_data_handle(packed_weights + weight_size_ + weight_h_size_);
  } else {
    // No bias supplied: feed the primitive an all-zero bias.
    ResetMemory(bias_memory, "Bias");
  }

  // construct bw memory
  auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng);
  auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng);
  auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng);
  auto user_diff_weights_memory = dnnl::memory(user_weights_desc, eng, packed_diff_weights);
  auto user_diff_weights_h_memory = dnnl::memory(user_weights_h_desc, eng, packed_diff_weights + weight_size_);
  ResetMemory(user_diff_weights_memory, "user weights grad");
  ResetMemory(user_diff_weights_h_memory, "user weights iter grad");
  ResetMemory(diff_weights_memory, "weights grad");
  ResetMemory(diff_weights_h_memory, "weights iter grad");
  if (has_bias_) {
    // Write the bias gradient straight into the packed output buffer.
    diff_bias_memory.set_data_handle(packed_diff_weights + weight_size_ + weight_h_size_);
  }
  ResetMemory(diff_bias_memory, "Bias grad");

  SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory,
                      diff_weights_h_memory, diff_bias_memory);
  ExecutePrimitive();
  Reorder(&diff_weights_memory, &user_diff_weights_memory);
  Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory);
  return true;
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,87 +1,87 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class LSTMGradCPUKernel : public MKLCPUKernel {
|
||||
public:
|
||||
LSTMGradCPUKernel() = default;
|
||||
~LSTMGradCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
protected:
|
||||
void InitInputOutputSize(const CNodePtr &kernel_node) override;
|
||||
|
||||
private:
|
||||
void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
|
||||
const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
|
||||
const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
|
||||
const dnnl::memory::desc &dst_c_desc);
|
||||
void SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &outputs, const dnnl::memory &weights_memory,
|
||||
const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory,
|
||||
const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory,
|
||||
const dnnl::memory &diff_bias_memory);
|
||||
void ResetMemory(const dnnl::memory &mem, const string name) const;
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
int64_t weight_size_ = 0;
|
||||
int64_t weight_h_size_ = 0;
|
||||
int64_t input_size_;
|
||||
int64_t hidden_size_;
|
||||
int64_t num_layers_;
|
||||
int64_t batch_size_;
|
||||
int64_t seq_len_;
|
||||
int num_directions_;
|
||||
bool bidirectional_;
|
||||
bool has_bias_;
|
||||
size_t reserve_size_;
|
||||
dnnl::memory::dims weights_dims_;
|
||||
dnnl::memory::dims weights_h_dims_;
|
||||
dnnl::memory::dims bias_dims_;
|
||||
dnnl::lstm_backward::primitive_desc prim_backward_desc_;
|
||||
};
|
||||
|
||||
// Registers LSTMGradCPUKernel for the LSTMGrad operator: eleven float32 inputs
// and four float32 outputs. The slot notes below follow how the kernel's
// implementation binds the buffers.
MS_REG_CPU_KERNEL(LSTMGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)    // 0: source layer
                    .AddInputAttr(kNumberTypeFloat32)    // 1: source hidden state
                    .AddInputAttr(kNumberTypeFloat32)    // 2: source cell state
                    .AddInputAttr(kNumberTypeFloat32)    // 3: packed weights (and bias)
                    .AddInputAttr(kNumberTypeFloat32)    // 4: destination layer
                    .AddInputAttr(kNumberTypeFloat32)    // 5: destination hidden state
                    .AddInputAttr(kNumberTypeFloat32)    // 6: destination cell state
                    .AddInputAttr(kNumberTypeFloat32)    // 7: gradient of destination layer
                    .AddInputAttr(kNumberTypeFloat32)    // 8: gradient of destination hidden state
                    .AddInputAttr(kNumberTypeFloat32)    // 9: gradient of destination cell state
                    .AddInputAttr(kNumberTypeFloat32)    // 10: workspace
                    .AddOutputAttr(kNumberTypeFloat32)   // 0: gradient of source layer
                    .AddOutputAttr(kNumberTypeFloat32)   // 1: gradient of source hidden state
                    .AddOutputAttr(kNumberTypeFloat32)   // 2: gradient of source cell state
                    .AddOutputAttr(kNumberTypeFloat32),  // 3: packed gradient of weights (and bias)
                  LSTMGradCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class LSTMGradCPUKernel : public MKLCPUKernel {
|
||||
public:
|
||||
LSTMGradCPUKernel() = default;
|
||||
~LSTMGradCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
protected:
|
||||
void InitInputOutputSize(const CNodePtr &kernel_node) override;
|
||||
|
||||
private:
|
||||
void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
|
||||
const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
|
||||
const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
|
||||
const dnnl::memory::desc &dst_c_desc);
|
||||
void SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &outputs, const dnnl::memory &weights_memory,
|
||||
const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory,
|
||||
const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory,
|
||||
const dnnl::memory &diff_bias_memory);
|
||||
void ResetMemory(const dnnl::memory &mem, const string name) const;
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
int64_t weight_size_ = 0;
|
||||
int64_t weight_h_size_ = 0;
|
||||
int64_t input_size_;
|
||||
int64_t hidden_size_;
|
||||
int64_t num_layers_;
|
||||
int64_t batch_size_;
|
||||
int64_t seq_len_;
|
||||
int num_directions_;
|
||||
bool bidirectional_;
|
||||
bool has_bias_;
|
||||
size_t reserve_size_;
|
||||
dnnl::memory::dims weights_dims_;
|
||||
dnnl::memory::dims weights_h_dims_;
|
||||
dnnl::memory::dims bias_dims_;
|
||||
dnnl::lstm_backward::primitive_desc prim_backward_desc_;
|
||||
};
|
||||
|
||||
// Registers LSTMGradCPUKernel for the LSTMGrad operator: eleven float32 inputs
// and four float32 outputs. The slot notes below follow how the kernel's
// implementation binds the buffers.
MS_REG_CPU_KERNEL(LSTMGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)    // 0: source layer
                    .AddInputAttr(kNumberTypeFloat32)    // 1: source hidden state
                    .AddInputAttr(kNumberTypeFloat32)    // 2: source cell state
                    .AddInputAttr(kNumberTypeFloat32)    // 3: packed weights (and bias)
                    .AddInputAttr(kNumberTypeFloat32)    // 4: destination layer
                    .AddInputAttr(kNumberTypeFloat32)    // 5: destination hidden state
                    .AddInputAttr(kNumberTypeFloat32)    // 6: destination cell state
                    .AddInputAttr(kNumberTypeFloat32)    // 7: gradient of destination layer
                    .AddInputAttr(kNumberTypeFloat32)    // 8: gradient of destination hidden state
                    .AddInputAttr(kNumberTypeFloat32)    // 9: gradient of destination cell state
                    .AddInputAttr(kNumberTypeFloat32)    // 10: workspace
                    .AddOutputAttr(kNumberTypeFloat32)   // 0: gradient of source layer
                    .AddOutputAttr(kNumberTypeFloat32)   // 1: gradient of source hidden state
                    .AddOutputAttr(kNumberTypeFloat32)   // 2: gradient of source cell state
                    .AddOutputAttr(kNumberTypeFloat32),  // 3: packed gradient of weights (and bias)
                  LSTMGradCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,99 +1,99 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h"
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
#include <cmath>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "utils/ms_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
|
||||
CPUKernel::InitInputOutputSize(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t type_size = sizeof(float);
|
||||
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
|
||||
workspace_size_list_.emplace_back(tensor_size);
|
||||
}
|
||||
|
||||
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
dnnl::memory::dims mem_dims;
|
||||
mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
|
||||
if (mem_dims.size() != 2) {
|
||||
MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
|
||||
}
|
||||
batch_size_ = shape[0];
|
||||
class_num_ = shape[1];
|
||||
if (batch_size_ == 0 || class_num_ == 0) {
|
||||
MS_LOG(EXCEPTION) << "Invalid batch size or class num input!";
|
||||
}
|
||||
dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc);
|
||||
|
||||
dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1);
|
||||
auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
|
||||
primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc);
|
||||
|
||||
AddArgument(DNNL_ARG_SRC, mem_desc);
|
||||
AddArgument(DNNL_ARG_DST, mem_desc);
|
||||
}
|
||||
|
||||
void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels,
|
||||
float *output1, float *output2) const {
|
||||
float epsilon = 1e-6;
|
||||
for (size_t i = 0; i < batch_size_; ++i) {
|
||||
output1[i] = 0;
|
||||
float loss = 0.0;
|
||||
for (size_t j = 0; j < class_num_; ++j) {
|
||||
float logit = logf(logits[i * class_num_ + j] <= 0.0 ? epsilon : logits[i * class_num_ + j]);
|
||||
output2[i * class_num_ + j] = logits[i * class_num_ + j] - labels[i * class_num_ + j];
|
||||
loss += labels[i * class_num_ + j] * logit;
|
||||
}
|
||||
output1[i] = -loss;
|
||||
}
|
||||
}
|
||||
|
||||
// Runs the softmax primitive on inputs[0] and derives both outputs from its result.
//
// inputs[0] is bound as the primitive source and workspace[0] as its destination; inputs[1] supplies the
// labels. outputs[0] receives one float per batch row and outputs[1] one float per element (see
// ForwardPostExecute). Raises via MS_LOG(EXCEPTION) when a required buffer is missing or a byte size
// does not match batch_size_ / class_num_; returns true otherwise.
bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                                    const std::vector<kernel::AddressPtr> &workspace,
                                                    const std::vector<kernel::AddressPtr> &outputs) {
  // inputs[1] and outputs[1] are dereferenced below, so an emptiness check alone would allow
  // out-of-bounds access when only one input or output is supplied.
  constexpr size_t kRequiredInputs = 2;
  constexpr size_t kRequiredOutputs = 2;
  if (inputs.size() < kRequiredInputs || workspace.empty() || outputs.size() < kRequiredOutputs) {
    MS_LOG(EXCEPTION) << "Error input output size!";
  }

  const size_t batch_float_size = batch_size_ * sizeof(float);
  const size_t batch_class_float_size = class_num_ * batch_float_size;
  if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size ||
      inputs[1]->size != batch_class_float_size) {
    MS_LOG(EXCEPTION) << "Error input data size!";
  }
  if (outputs[1]->size != batch_class_float_size || outputs[0]->size != batch_float_size) {
    MS_LOG(EXCEPTION) << "Error output data size!";
  }

  SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr);
  ExecutePrimitive();

  // The primitive's result in the workspace is what ForwardPostExecute receives as `logits`.
  auto labels = reinterpret_cast<float *>(inputs[1]->addr);
  auto logits = reinterpret_cast<float *>(workspace[0]->addr);
  auto output1 = reinterpret_cast<float *>(outputs[0]->addr);
  auto output2 = reinterpret_cast<float *>(outputs[1]->addr);
  ForwardPostExecute(logits, labels, output1, output2);
  return true;
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h"
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
#include <cmath>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "utils/ms_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
|
||||
CPUKernel::InitInputOutputSize(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t type_size = sizeof(float);
|
||||
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
|
||||
workspace_size_list_.emplace_back(tensor_size);
|
||||
}
|
||||
|
||||
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
dnnl::memory::dims mem_dims;
|
||||
mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
|
||||
if (mem_dims.size() != 2) {
|
||||
MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
|
||||
}
|
||||
batch_size_ = shape[0];
|
||||
class_num_ = shape[1];
|
||||
if (batch_size_ == 0 || class_num_ == 0) {
|
||||
MS_LOG(EXCEPTION) << "Invalid batch size or class num input!";
|
||||
}
|
||||
dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc);
|
||||
|
||||
dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1);
|
||||
auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
|
||||
primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc);
|
||||
|
||||
AddArgument(DNNL_ARG_SRC, mem_desc);
|
||||
AddArgument(DNNL_ARG_DST, mem_desc);
|
||||
}
|
||||
|
||||
void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels,
|
||||
float *output1, float *output2) const {
|
||||
float epsilon = 1e-6;
|
||||
for (size_t i = 0; i < batch_size_; ++i) {
|
||||
output1[i] = 0;
|
||||
float loss = 0.0;
|
||||
for (size_t j = 0; j < class_num_; ++j) {
|
||||
float logit = logf(logits[i * class_num_ + j] <= 0.0 ? epsilon : logits[i * class_num_ + j]);
|
||||
output2[i * class_num_ + j] = logits[i * class_num_ + j] - labels[i * class_num_ + j];
|
||||
loss += labels[i * class_num_ + j] * logit;
|
||||
}
|
||||
output1[i] = -loss;
|
||||
}
|
||||
}
|
||||
|
||||
// Runs the softmax primitive on inputs[0] and derives both outputs from its result.
//
// inputs[0] is bound as the primitive source and workspace[0] as its destination; inputs[1] supplies the
// labels. outputs[0] receives one float per batch row and outputs[1] one float per element (see
// ForwardPostExecute). Raises via MS_LOG(EXCEPTION) when a required buffer is missing or a byte size
// does not match batch_size_ / class_num_; returns true otherwise.
bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                                    const std::vector<kernel::AddressPtr> &workspace,
                                                    const std::vector<kernel::AddressPtr> &outputs) {
  // inputs[1] and outputs[1] are dereferenced below, so an emptiness check alone would allow
  // out-of-bounds access when only one input or output is supplied.
  constexpr size_t kRequiredInputs = 2;
  constexpr size_t kRequiredOutputs = 2;
  if (inputs.size() < kRequiredInputs || workspace.empty() || outputs.size() < kRequiredOutputs) {
    MS_LOG(EXCEPTION) << "Error input output size!";
  }

  const size_t batch_float_size = batch_size_ * sizeof(float);
  const size_t batch_class_float_size = class_num_ * batch_float_size;
  if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size ||
      inputs[1]->size != batch_class_float_size) {
    MS_LOG(EXCEPTION) << "Error input data size!";
  }
  if (outputs[1]->size != batch_class_float_size || outputs[0]->size != batch_float_size) {
    MS_LOG(EXCEPTION) << "Error output data size!";
  }

  SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr);
  ExecutePrimitive();

  // The primitive's result in the workspace is what ForwardPostExecute receives as `logits`.
  auto labels = reinterpret_cast<float *>(inputs[1]->addr);
  auto logits = reinterpret_cast<float *>(workspace[0]->addr);
  auto output1 = reinterpret_cast<float *>(outputs[0]->addr);
  auto output2 = reinterpret_cast<float *>(outputs[1]->addr);
  ForwardPostExecute(logits, labels, output1, output2);
  return true;
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,53 +1,53 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
|
||||
public:
|
||||
SoftmaxCrossEntropyWithLogitsCPUKernel() = default;
|
||||
~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
protected:
|
||||
void InitInputOutputSize(const CNodePtr &kernel_node) override;
|
||||
|
||||
private:
|
||||
void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const;
|
||||
size_t class_num_{0};
|
||||
size_t batch_size_{0};
|
||||
};
|
||||
// Registers SoftmaxCrossEntropyWithLogitsCPUKernel under the op name SoftmaxCrossEntropyWithLogits
// with a kernel attribute that declares two float32 inputs and two float32 outputs.
MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
SoftmaxCrossEntropyWithLogitsCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
|
||||
public:
|
||||
SoftmaxCrossEntropyWithLogitsCPUKernel() = default;
|
||||
~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
protected:
|
||||
void InitInputOutputSize(const CNodePtr &kernel_node) override;
|
||||
|
||||
private:
|
||||
void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const;
|
||||
size_t class_num_{0};
|
||||
size_t batch_size_{0};
|
||||
};
|
||||
// Registers SoftmaxCrossEntropyWithLogitsCPUKernel under the op name SoftmaxCrossEntropyWithLogits
// with a kernel attribute that declares two float32 inputs and two float32 outputs.
MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
SoftmaxCrossEntropyWithLogitsCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,59 +1,59 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/kernel.h"
|
||||
#include "ps/util.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace ps {
|
||||
using mindspore::ps::Util;
|
||||
class PServerKernel {
|
||||
public:
|
||||
PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num)
|
||||
: rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {}
|
||||
~PServerKernel() = default;
|
||||
PServerKernel(const PServerKernel &) = delete;
|
||||
PServerKernel &operator=(const PServerKernel &) = delete;
|
||||
virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
|
||||
virtual void InitKernel(const CNodePtr &cnode,
|
||||
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
|
||||
virtual void ReInit(const std::vector<std::vector<size_t>> &) {}
|
||||
virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) = 0;
|
||||
virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals,
|
||||
size_t ids_size) {}
|
||||
virtual const std::vector<size_t> &input_sizes() const = 0;
|
||||
virtual const std::vector<size_t> &output_sizes() const = 0;
|
||||
virtual const std::vector<size_t> &workspace_sizes() const = 0;
|
||||
|
||||
protected:
|
||||
virtual void ReInit(const std::vector<AddressPtr> &) {}
|
||||
void Shard(std::vector<size_t> *shape, int axis);
|
||||
|
||||
size_t rank_id_;
|
||||
size_t pserver_num_;
|
||||
size_t worker_num_;
|
||||
};
|
||||
} // namespace ps
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/kernel.h"
|
||||
#include "ps/util.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace ps {
|
||||
using mindspore::ps::Util;
|
||||
class PServerKernel {
|
||||
public:
|
||||
PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num)
|
||||
: rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {}
|
||||
~PServerKernel() = default;
|
||||
PServerKernel(const PServerKernel &) = delete;
|
||||
PServerKernel &operator=(const PServerKernel &) = delete;
|
||||
virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
|
||||
virtual void InitKernel(const CNodePtr &cnode,
|
||||
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
|
||||
virtual void ReInit(const std::vector<std::vector<size_t>> &) {}
|
||||
virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) = 0;
|
||||
virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals,
|
||||
size_t ids_size) {}
|
||||
virtual const std::vector<size_t> &input_sizes() const = 0;
|
||||
virtual const std::vector<size_t> &output_sizes() const = 0;
|
||||
virtual const std::vector<size_t> &workspace_sizes() const = 0;
|
||||
|
||||
protected:
|
||||
virtual void ReInit(const std::vector<AddressPtr> &) {}
|
||||
void Shard(std::vector<size_t> *shape, int axis);
|
||||
|
||||
size_t rank_id_;
|
||||
size_t pserver_num_;
|
||||
size_t worker_num_;
|
||||
};
|
||||
} // namespace ps
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_
|
||||
|
|
|
@ -1,138 +1,138 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS);
|
||||
if (axis_addr->isa<ValueTuple>() || axis_addr->isa<ValueList>()) {
|
||||
axis_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS);
|
||||
} else if (axis_addr->isa<Int64Imm>()) {
|
||||
axis_.emplace_back(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS));
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Attribute is invalid";
|
||||
}
|
||||
|
||||
int dimension = input_shape_.size();
|
||||
std::transform(axis_.begin(), axis_.end(), axis_.begin(),
|
||||
[dimension](const auto &a) { return a < 0 ? dimension + a : a; });
|
||||
sort(axis_.begin(), axis_.end());
|
||||
// Delete the duplicate axis.
|
||||
auto last = std::unique(axis_.begin(), axis_.end());
|
||||
axis_.erase(last, axis_.end());
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
|
||||
if constexpr (std::is_same<T, bool>::value) {
|
||||
if (kernel_name == "ReduceAll") {
|
||||
reduce_type_ = kReduceAll;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; };
|
||||
} else if (kernel_name == "ReduceAny") {
|
||||
reduce_type_ = kReduceAny;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; };
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool.";
|
||||
}
|
||||
} else {
|
||||
if (kernel_name == "ReduceMax") {
|
||||
reduce_type_ = kReduceMax;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); };
|
||||
} else if (kernel_name == "ReduceMin") {
|
||||
reduce_type_ = kReduceMin;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); };
|
||||
} else if (kernel_name == "ReduceSum") {
|
||||
reduce_type_ = kReduceSum;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
|
||||
} else if (kernel_name == "ReduceMean") {
|
||||
reduce_type_ = kReduceMean;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
size_t input_size = inputs[0]->size / sizeof(T);
|
||||
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
|
||||
// Get one ret
|
||||
*output_addr = input_addr[0];
|
||||
for (size_t i = 1; i < input_size; ++i) {
|
||||
reduce_func_(input_addr, i, output_addr);
|
||||
}
|
||||
if (reduce_type_ == kReduceMean) {
|
||||
*output_addr /= input_size;
|
||||
}
|
||||
} else {
|
||||
// Calculate transpose axes and stride
|
||||
int dimension = input_shape_.size();
|
||||
size_t stride = 1;
|
||||
std::vector<size_t> axes(input_shape_.size());
|
||||
size_t j = 0;
|
||||
size_t k = 0;
|
||||
for (int i = 0; i < dimension; ++i) {
|
||||
if (j == axis_.size() || i != axis_[j]) {
|
||||
axes[k] = i;
|
||||
++k;
|
||||
} else {
|
||||
stride *= input_shape_[i];
|
||||
++j;
|
||||
}
|
||||
}
|
||||
for (auto &it : axis_) {
|
||||
axes[k] = it;
|
||||
++k;
|
||||
}
|
||||
// Calculate transpose shape
|
||||
std::vector<size_t> transpose_shape(input_shape_.size());
|
||||
for (int i = 0; i < dimension; ++i) {
|
||||
transpose_shape[i] = input_shape_[axes[i]];
|
||||
}
|
||||
size_t output_size = outputs[0]->size / sizeof(T);
|
||||
TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
|
||||
auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start * stride);
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
output_addr[i] = input_addr[iter.GetPos()];
|
||||
iter.GenNextPos();
|
||||
for (size_t j = 1; j < stride; ++j) {
|
||||
reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
|
||||
iter.GenNextPos();
|
||||
}
|
||||
if (reduce_type_ == kReduceMean) {
|
||||
output_addr[i] /= stride;
|
||||
}
|
||||
}
|
||||
};
|
||||
CPUKernelUtils::ParallelFor(task, output_size);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS);
|
||||
if (axis_addr->isa<ValueTuple>() || axis_addr->isa<ValueList>()) {
|
||||
axis_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS);
|
||||
} else if (axis_addr->isa<Int64Imm>()) {
|
||||
axis_.emplace_back(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS));
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Attribute is invalid";
|
||||
}
|
||||
|
||||
int dimension = input_shape_.size();
|
||||
std::transform(axis_.begin(), axis_.end(), axis_.begin(),
|
||||
[dimension](const auto &a) { return a < 0 ? dimension + a : a; });
|
||||
sort(axis_.begin(), axis_.end());
|
||||
// Delete the duplicate axis.
|
||||
auto last = std::unique(axis_.begin(), axis_.end());
|
||||
axis_.erase(last, axis_.end());
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
|
||||
if constexpr (std::is_same<T, bool>::value) {
|
||||
if (kernel_name == "ReduceAll") {
|
||||
reduce_type_ = kReduceAll;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; };
|
||||
} else if (kernel_name == "ReduceAny") {
|
||||
reduce_type_ = kReduceAny;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; };
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool.";
|
||||
}
|
||||
} else {
|
||||
if (kernel_name == "ReduceMax") {
|
||||
reduce_type_ = kReduceMax;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); };
|
||||
} else if (kernel_name == "ReduceMin") {
|
||||
reduce_type_ = kReduceMin;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); };
|
||||
} else if (kernel_name == "ReduceSum") {
|
||||
reduce_type_ = kReduceSum;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
|
||||
} else if (kernel_name == "ReduceMean") {
|
||||
reduce_type_ = kReduceMean;
|
||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
size_t input_size = inputs[0]->size / sizeof(T);
|
||||
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
|
||||
// Get one ret
|
||||
*output_addr = input_addr[0];
|
||||
for (size_t i = 1; i < input_size; ++i) {
|
||||
reduce_func_(input_addr, i, output_addr);
|
||||
}
|
||||
if (reduce_type_ == kReduceMean) {
|
||||
*output_addr /= input_size;
|
||||
}
|
||||
} else {
|
||||
// Calculate transpose axes and stride
|
||||
int dimension = input_shape_.size();
|
||||
size_t stride = 1;
|
||||
std::vector<size_t> axes(input_shape_.size());
|
||||
size_t j = 0;
|
||||
size_t k = 0;
|
||||
for (int i = 0; i < dimension; ++i) {
|
||||
if (j == axis_.size() || i != axis_[j]) {
|
||||
axes[k] = i;
|
||||
++k;
|
||||
} else {
|
||||
stride *= input_shape_[i];
|
||||
++j;
|
||||
}
|
||||
}
|
||||
for (auto &it : axis_) {
|
||||
axes[k] = it;
|
||||
++k;
|
||||
}
|
||||
// Calculate transpose shape
|
||||
std::vector<size_t> transpose_shape(input_shape_.size());
|
||||
for (int i = 0; i < dimension; ++i) {
|
||||
transpose_shape[i] = input_shape_[axes[i]];
|
||||
}
|
||||
size_t output_size = outputs[0]->size / sizeof(T);
|
||||
TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
|
||||
auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start * stride);
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
output_addr[i] = input_addr[iter.GetPos()];
|
||||
iter.GenNextPos();
|
||||
for (size_t j = 1; j < stride; ++j) {
|
||||
reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
|
||||
iter.GenNextPos();
|
||||
}
|
||||
if (reduce_type_ == kReduceMean) {
|
||||
output_addr[i] /= stride;
|
||||
}
|
||||
}
|
||||
};
|
||||
CPUKernelUtils::ParallelFor(task, output_size);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,69 +1,69 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <functional>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
class ReduceCPUKernel : public CPUKernel {
|
||||
public:
|
||||
ReduceCPUKernel() = default;
|
||||
~ReduceCPUKernel() override = default;
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean };
|
||||
std::vector<size_t> input_shape_;
|
||||
std::vector<int64_t> axis_;
|
||||
ReduceType reduce_type_{kReduceAll};
|
||||
std::function<void(const T *, size_t, T *)> reduce_func_;
|
||||
};
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float);
|
||||
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double);
|
||||
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t);
|
||||
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <functional>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
class ReduceCPUKernel : public CPUKernel {
|
||||
public:
|
||||
ReduceCPUKernel() = default;
|
||||
~ReduceCPUKernel() override = default;
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean };
|
||||
std::vector<size_t> input_shape_;
|
||||
std::vector<int64_t> axis_;
|
||||
ReduceType reduce_type_{kReduceAll};
|
||||
std::function<void(const T *, size_t, T *)> reduce_func_;
|
||||
};
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float);
|
||||
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double);
|
||||
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t);
|
||||
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t);
|
||||
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,91 +1,91 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
void SpaceToDepthCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
CheckParam(kernel_node);
|
||||
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> & /*workspace*/,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
size_t size = IntToSize(inputs[0]->size / sizeof(T));
|
||||
|
||||
std::vector<size_t> input_shape = input_shape_;
|
||||
std::vector<size_t> output_shape = output_shape_;
|
||||
size_t block_size = block_size_;
|
||||
size_t input_dimension = input_shape.size();
|
||||
size_t input_strides[3] = {1, 1, 1};
|
||||
|
||||
for (size_t i = input_dimension - 1; i >= 1; --i) {
|
||||
for (size_t j = 0; j < i; ++j) {
|
||||
input_strides[j] *= input_shape[i];
|
||||
}
|
||||
}
|
||||
|
||||
auto task = [&, input_addr, output_addr](size_t start, size_t end) {
|
||||
std::vector<size_t> input_pos_array(input_dimension, 0);
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
size_t tmp_pos = i;
|
||||
for (size_t j = 0; j < input_dimension - 1; ++j) {
|
||||
input_pos_array[j] = tmp_pos / input_strides[j];
|
||||
tmp_pos %= input_strides[j];
|
||||
}
|
||||
input_pos_array.back() = tmp_pos;
|
||||
size_t output_pos = input_pos_array[0];
|
||||
output_pos =
|
||||
(output_pos * output_shape[1]) +
|
||||
(input_pos_array[1] +
|
||||
(block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]);
|
||||
output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size);
|
||||
output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size);
|
||||
output_addr[output_pos] = input_addr[i];
|
||||
}
|
||||
};
|
||||
|
||||
CPUKernelUtils::ParallelFor(task, size);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void SpaceToDepthCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
void SpaceToDepthCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
CheckParam(kernel_node);
|
||||
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> & /*workspace*/,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
size_t size = IntToSize(inputs[0]->size / sizeof(T));
|
||||
|
||||
std::vector<size_t> input_shape = input_shape_;
|
||||
std::vector<size_t> output_shape = output_shape_;
|
||||
size_t block_size = block_size_;
|
||||
size_t input_dimension = input_shape.size();
|
||||
size_t input_strides[3] = {1, 1, 1};
|
||||
|
||||
for (size_t i = input_dimension - 1; i >= 1; --i) {
|
||||
for (size_t j = 0; j < i; ++j) {
|
||||
input_strides[j] *= input_shape[i];
|
||||
}
|
||||
}
|
||||
|
||||
auto task = [&, input_addr, output_addr](size_t start, size_t end) {
|
||||
std::vector<size_t> input_pos_array(input_dimension, 0);
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
size_t tmp_pos = i;
|
||||
for (size_t j = 0; j < input_dimension - 1; ++j) {
|
||||
input_pos_array[j] = tmp_pos / input_strides[j];
|
||||
tmp_pos %= input_strides[j];
|
||||
}
|
||||
input_pos_array.back() = tmp_pos;
|
||||
size_t output_pos = input_pos_array[0];
|
||||
output_pos =
|
||||
(output_pos * output_shape[1]) +
|
||||
(input_pos_array[1] +
|
||||
(block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]);
|
||||
output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size);
|
||||
output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size);
|
||||
output_addr[output_pos] = input_addr[i];
|
||||
}
|
||||
};
|
||||
|
||||
CPUKernelUtils::ParallelFor(task, size);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void SpaceToDepthCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,84 +1,84 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
class SpaceToDepthCPUKernel : public CPUKernel {
|
||||
public:
|
||||
SpaceToDepthCPUKernel() = default;
|
||||
~SpaceToDepthCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> input_shape_;
|
||||
std::vector<size_t> output_shape_;
|
||||
size_t block_size_;
|
||||
};
|
||||
|
||||
MS_REG_CPU_KERNEL_T(
|
||||
SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
SpaceToDepthCPUKernel, float);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(
|
||||
SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||
SpaceToDepthCPUKernel, float16);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
|
||||
SpaceToDepthCPUKernel, int8_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
|
||||
SpaceToDepthCPUKernel, int16_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
|
||||
SpaceToDepthCPUKernel, int);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
|
||||
SpaceToDepthCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
|
||||
SpaceToDepthCPUKernel, uint8_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
|
||||
SpaceToDepthCPUKernel, uint16_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
|
||||
SpaceToDepthCPUKernel, uint32_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
|
||||
SpaceToDepthCPUKernel, uint64_t);
|
||||
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
class SpaceToDepthCPUKernel : public CPUKernel {
|
||||
public:
|
||||
SpaceToDepthCPUKernel() = default;
|
||||
~SpaceToDepthCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> input_shape_;
|
||||
std::vector<size_t> output_shape_;
|
||||
size_t block_size_;
|
||||
};
|
||||
|
||||
MS_REG_CPU_KERNEL_T(
|
||||
SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
SpaceToDepthCPUKernel, float);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(
|
||||
SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||
SpaceToDepthCPUKernel, float16);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
|
||||
SpaceToDepthCPUKernel, int8_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
|
||||
SpaceToDepthCPUKernel, int16_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
|
||||
SpaceToDepthCPUKernel, int);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
|
||||
SpaceToDepthCPUKernel, int64_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
|
||||
SpaceToDepthCPUKernel, uint8_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
|
||||
SpaceToDepthCPUKernel, uint16_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
|
||||
SpaceToDepthCPUKernel, uint32_t);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(SpaceToDepth,
|
||||
KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
|
||||
SpaceToDepthCPUKernel, uint64_t);
|
||||
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,87 +1,87 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include "backend/kernel_compiler/cpu/topk_cpu_kernel.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
|
||||
if (inputs.size() != 2 || outputs.size() != 2) {
|
||||
MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but get inputs: " << inputs.size()
|
||||
<< "outputs: " << outputs.size();
|
||||
}
|
||||
if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) {
|
||||
MS_LOG(EXCEPTION) << "Error input data size!";
|
||||
}
|
||||
if (inputs[1]->size != sizeof(int)) {
|
||||
MS_LOG(EXCEPTION) << "Input K must be int!";
|
||||
}
|
||||
auto input = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
int k = reinterpret_cast<int *>(inputs[1]->addr)[0];
|
||||
auto output = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
auto indices = reinterpret_cast<int *>(outputs[1]->addr);
|
||||
if (k < 1) {
|
||||
MS_LOG(EXCEPTION) << "Input k must > 0!";
|
||||
}
|
||||
size_t k_num = IntToSize(std::min<int>(inner_size_, k));
|
||||
if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) {
|
||||
MS_LOG(EXCEPTION) << "Error output data size!";
|
||||
}
|
||||
for (size_t i = 0; i < outer_size_; ++i) {
|
||||
std::vector<size_t> idx(inner_size_);
|
||||
auto base_input = i * inner_size_;
|
||||
std::iota(idx.begin(), idx.end(), base_input);
|
||||
std::stable_sort(idx.begin(), idx.end(),
|
||||
[&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; });
|
||||
auto base_output = i * k_num;
|
||||
if (!sorted_) {
|
||||
std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num));
|
||||
}
|
||||
for (size_t j = 0; j < k_num; ++j) {
|
||||
indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input);
|
||||
output[base_output + j] = input[idx[j]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
for (size_t i = 0; i < x_shape_.size() - 1; ++i) {
|
||||
outer_size_ *= x_shape_[i];
|
||||
}
|
||||
inner_size_ = x_shape_[x_shape_.size() - 1];
|
||||
sorted_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "sorted");
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
}
|
||||
|
||||
// Dispatches to the typed implementation matching the input data type cached by InitKernel.
// Supports float16 and float32; any other type raises through MS_LOG(EXCEPTION) rather than
// returning success with the outputs left unwritten.
bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  if (dtype_ == kNumberTypeFloat16) {
    LaunchKernel<float16>(inputs, outputs);
  } else if (dtype_ == kNumberTypeFloat32) {
    LaunchKernel<float>(inputs, outputs);
  } else {
    MS_LOG(EXCEPTION) << "TopK only supports float16 and float32 input, but got type id "
                      << static_cast<int>(dtype_) << ".";
  }
  return true;
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include "backend/kernel_compiler/cpu/topk_cpu_kernel.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
|
||||
if (inputs.size() != 2 || outputs.size() != 2) {
|
||||
MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but get inputs: " << inputs.size()
|
||||
<< "outputs: " << outputs.size();
|
||||
}
|
||||
if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) {
|
||||
MS_LOG(EXCEPTION) << "Error input data size!";
|
||||
}
|
||||
if (inputs[1]->size != sizeof(int)) {
|
||||
MS_LOG(EXCEPTION) << "Input K must be int!";
|
||||
}
|
||||
auto input = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
int k = reinterpret_cast<int *>(inputs[1]->addr)[0];
|
||||
auto output = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
auto indices = reinterpret_cast<int *>(outputs[1]->addr);
|
||||
if (k < 1) {
|
||||
MS_LOG(EXCEPTION) << "Input k must > 0!";
|
||||
}
|
||||
size_t k_num = IntToSize(std::min<int>(inner_size_, k));
|
||||
if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) {
|
||||
MS_LOG(EXCEPTION) << "Error output data size!";
|
||||
}
|
||||
for (size_t i = 0; i < outer_size_; ++i) {
|
||||
std::vector<size_t> idx(inner_size_);
|
||||
auto base_input = i * inner_size_;
|
||||
std::iota(idx.begin(), idx.end(), base_input);
|
||||
std::stable_sort(idx.begin(), idx.end(),
|
||||
[&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; });
|
||||
auto base_output = i * k_num;
|
||||
if (!sorted_) {
|
||||
std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num));
|
||||
}
|
||||
for (size_t j = 0; j < k_num; ++j) {
|
||||
indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input);
|
||||
output[base_output + j] = input[idx[j]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
for (size_t i = 0; i < x_shape_.size() - 1; ++i) {
|
||||
outer_size_ *= x_shape_[i];
|
||||
}
|
||||
inner_size_ = x_shape_[x_shape_.size() - 1];
|
||||
sorted_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "sorted");
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
}
|
||||
|
||||
// Dispatches to the typed implementation matching the input data type cached by InitKernel.
// Supports float16 and float32; any other type raises through MS_LOG(EXCEPTION) rather than
// returning success with the outputs left unwritten.
bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  if (dtype_ == kNumberTypeFloat16) {
    LaunchKernel<float16>(inputs, outputs);
  } else if (dtype_ == kNumberTypeFloat32) {
    LaunchKernel<float>(inputs, outputs);
  } else {
    MS_LOG(EXCEPTION) << "TopK only supports float16 and float32 input, but got type id "
                      << static_cast<int>(dtype_) << ".";
  }
  return true;
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,46 +1,46 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class TopKCPUKernel : public CPUKernel {
|
||||
public:
|
||||
TopKCPUKernel() = default;
|
||||
~TopKCPUKernel() override = default;
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
size_t outer_size_{1};
|
||||
size_t inner_size_{1};
|
||||
bool sorted_{false};
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
};
|
||||
|
||||
MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel)
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class TopKCPUKernel : public CPUKernel {
|
||||
public:
|
||||
TopKCPUKernel() = default;
|
||||
~TopKCPUKernel() override = default;
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
size_t outer_size_{1};
|
||||
size_t inner_size_{1};
|
||||
bool sorted_{false};
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
};
|
||||
|
||||
// NOTE(review): MS_REG_CPU_KERNEL is defined outside this view; presumably it registers
// TopKCPUKernel for the "TopK" op using a default-constructed KernelAttr - confirm in the factory header.
MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel)
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_
|
||||
|
|
|
@ -1,159 +1,159 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h"
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "common/thread_pool.h"
|
||||
#include "nnacl/fp32/transpose_fp32.h"
|
||||
#include "nnacl/int8/transpose_int8.h"
|
||||
#include "nnacl/errorcode.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
|
||||
axes_ = {tmp.begin(), tmp.end()};
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) {
|
||||
MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got "
|
||||
<< axes_.size() << "D.";
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < axes_.size(); ++i) {
|
||||
transpose_param_.perm_[i] = SizeToInt(axes_[i]);
|
||||
}
|
||||
int num_axes = SizeToInt(input_shape_.size());
|
||||
transpose_param_.perm_size_ = axes_.size();
|
||||
transpose_param_.num_axes_ = num_axes;
|
||||
transpose_param_.strides_[num_axes - 1] = 1;
|
||||
transpose_param_.out_strides_[num_axes - 1] = 1;
|
||||
for (int i = num_axes - 2; i >= 0; i--) {
|
||||
transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1];
|
||||
transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1];
|
||||
}
|
||||
launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>;
|
||||
launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>;
|
||||
launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel<int>;
|
||||
launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel<int64_t>;
|
||||
launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel<uint8_t>;
|
||||
launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel<uint16_t>;
|
||||
launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel<uint32_t>;
|
||||
launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel<uint64_t>;
|
||||
launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel<float>;
|
||||
launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel<bool>;
|
||||
|
||||
auto iter = launch_map_.find(dtype_);
|
||||
if (iter != launch_map_.end()) {
|
||||
launch_func_ = iter->second;
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU.";
|
||||
}
|
||||
}
|
||||
|
||||
// Overrides CPUKernel::Launch. The workspace list is ignored; the call is forwarded to
// the typed function stored in launch_func_ by InitKernel. Always returns true.
bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                   const std::vector<kernel::AddressPtr> & /* workspace */,
                                   const std::vector<kernel::AddressPtr> &outputs) {
  // Dispatch to the LaunchKernel<T> instantiation chosen for the input dtype.
  launch_func_(this, inputs, outputs);
  return true;
}
|
||||
|
||||
template <typename T>
|
||||
void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
transpose_param_.data_num_ = inputs[0]->size / sizeof(T);
|
||||
int output_shape[SizeToInt(output_shape_.size())];
|
||||
for (size_t i = 0; i < output_shape_.size(); ++i) {
|
||||
output_shape[i] = SizeToInt(output_shape_[i]);
|
||||
}
|
||||
size_t data_count = (inputs[0]->size) / sizeof(T);
|
||||
if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) {
|
||||
int res = NNACL_ERR;
|
||||
if constexpr (std::is_same_v<T, int8_t>) {
|
||||
res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, int16_t>) {
|
||||
res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, int32_t>) {
|
||||
res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, uint8_t>) {
|
||||
res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, uint16_t>) {
|
||||
res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, uint32_t>) {
|
||||
res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, uint64_t>) {
|
||||
res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, bool>) {
|
||||
res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
}
|
||||
if (res != NNACL_OK) {
|
||||
MS_LOG(ERROR) << "Transpose run failed";
|
||||
}
|
||||
} else {
|
||||
ParallelRun(input_addr, output_addr, output_shape, data_count);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
|
||||
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
|
||||
const float block_size = 128.0;
|
||||
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
|
||||
std::vector<common::Task> tasks;
|
||||
std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
|
||||
|
||||
if constexpr (std::is_same_v<T, int8_t>) {
|
||||
TransposeDims = &TransposeDimsInt8;
|
||||
} else if constexpr (std::is_same_v<T, int16_t>) {
|
||||
TransposeDims = &TransposeDimsInt16;
|
||||
} else if constexpr (std::is_same_v<T, int32_t>) {
|
||||
TransposeDims = &TransposeDimsInt32;
|
||||
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
TransposeDims = &TransposeDimsInt64;
|
||||
} else if constexpr (std::is_same_v<T, uint8_t>) {
|
||||
TransposeDims = &TransposeDimsUInt8;
|
||||
} else if constexpr (std::is_same_v<T, uint16_t>) {
|
||||
TransposeDims = &TransposeDimsUInt16;
|
||||
} else if constexpr (std::is_same_v<T, uint32_t>) {
|
||||
TransposeDims = &TransposeDimsUInt32;
|
||||
} else if constexpr (std::is_same_v<T, uint64_t>) {
|
||||
TransposeDims = &TransposeDimsUInt64;
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
TransposeDims = &TransposeDimsFp32;
|
||||
} else if constexpr (std::is_same_v<T, bool>) {
|
||||
TransposeDims = &TransposeDimsBool;
|
||||
}
|
||||
for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
|
||||
auto task = [&, task_id, thread_num]() {
|
||||
TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
|
||||
return common::SUCCESS;
|
||||
};
|
||||
tasks.emplace_back(task);
|
||||
}
|
||||
common::ThreadPool::GetInstance().SyncRun(tasks);
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h"
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "common/thread_pool.h"
|
||||
#include "nnacl/fp32/transpose_fp32.h"
|
||||
#include "nnacl/int8/transpose_int8.h"
|
||||
#include "nnacl/errorcode.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
|
||||
axes_ = {tmp.begin(), tmp.end()};
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) {
|
||||
MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got "
|
||||
<< axes_.size() << "D.";
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < axes_.size(); ++i) {
|
||||
transpose_param_.perm_[i] = SizeToInt(axes_[i]);
|
||||
}
|
||||
int num_axes = SizeToInt(input_shape_.size());
|
||||
transpose_param_.perm_size_ = axes_.size();
|
||||
transpose_param_.num_axes_ = num_axes;
|
||||
transpose_param_.strides_[num_axes - 1] = 1;
|
||||
transpose_param_.out_strides_[num_axes - 1] = 1;
|
||||
for (int i = num_axes - 2; i >= 0; i--) {
|
||||
transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1];
|
||||
transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1];
|
||||
}
|
||||
launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>;
|
||||
launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>;
|
||||
launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel<int>;
|
||||
launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel<int64_t>;
|
||||
launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel<uint8_t>;
|
||||
launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel<uint16_t>;
|
||||
launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel<uint32_t>;
|
||||
launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel<uint64_t>;
|
||||
launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel<float>;
|
||||
launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel<bool>;
|
||||
|
||||
auto iter = launch_map_.find(dtype_);
|
||||
if (iter != launch_map_.end()) {
|
||||
launch_func_ = iter->second;
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU.";
|
||||
}
|
||||
}
|
||||
|
||||
// Overrides CPUKernel::Launch. The workspace list is ignored; the call is forwarded to
// the typed function stored in launch_func_ by InitKernel. Always returns true.
bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                   const std::vector<kernel::AddressPtr> & /* workspace */,
                                   const std::vector<kernel::AddressPtr> &outputs) {
  // Dispatch to the LaunchKernel<T> instantiation chosen for the input dtype.
  launch_func_(this, inputs, outputs);
  return true;
}
|
||||
|
||||
template <typename T>
|
||||
void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
transpose_param_.data_num_ = inputs[0]->size / sizeof(T);
|
||||
int output_shape[SizeToInt(output_shape_.size())];
|
||||
for (size_t i = 0; i < output_shape_.size(); ++i) {
|
||||
output_shape[i] = SizeToInt(output_shape_[i]);
|
||||
}
|
||||
size_t data_count = (inputs[0]->size) / sizeof(T);
|
||||
if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) {
|
||||
int res = NNACL_ERR;
|
||||
if constexpr (std::is_same_v<T, int8_t>) {
|
||||
res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, int16_t>) {
|
||||
res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, int32_t>) {
|
||||
res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, uint8_t>) {
|
||||
res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, uint16_t>) {
|
||||
res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, uint32_t>) {
|
||||
res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, uint64_t>) {
|
||||
res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
} else if constexpr (std::is_same_v<T, bool>) {
|
||||
res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_);
|
||||
}
|
||||
if (res != NNACL_OK) {
|
||||
MS_LOG(ERROR) << "Transpose run failed";
|
||||
}
|
||||
} else {
|
||||
ParallelRun(input_addr, output_addr, output_shape, data_count);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
|
||||
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
|
||||
const float block_size = 128.0;
|
||||
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
|
||||
std::vector<common::Task> tasks;
|
||||
std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
|
||||
|
||||
if constexpr (std::is_same_v<T, int8_t>) {
|
||||
TransposeDims = &TransposeDimsInt8;
|
||||
} else if constexpr (std::is_same_v<T, int16_t>) {
|
||||
TransposeDims = &TransposeDimsInt16;
|
||||
} else if constexpr (std::is_same_v<T, int32_t>) {
|
||||
TransposeDims = &TransposeDimsInt32;
|
||||
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
TransposeDims = &TransposeDimsInt64;
|
||||
} else if constexpr (std::is_same_v<T, uint8_t>) {
|
||||
TransposeDims = &TransposeDimsUInt8;
|
||||
} else if constexpr (std::is_same_v<T, uint16_t>) {
|
||||
TransposeDims = &TransposeDimsUInt16;
|
||||
} else if constexpr (std::is_same_v<T, uint32_t>) {
|
||||
TransposeDims = &TransposeDimsUInt32;
|
||||
} else if constexpr (std::is_same_v<T, uint64_t>) {
|
||||
TransposeDims = &TransposeDimsUInt64;
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
TransposeDims = &TransposeDimsFp32;
|
||||
} else if constexpr (std::is_same_v<T, bool>) {
|
||||
TransposeDims = &TransposeDimsBool;
|
||||
}
|
||||
for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
|
||||
auto task = [&, task_id, thread_num]() {
|
||||
TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
|
||||
return common::SUCCESS;
|
||||
};
|
||||
tasks.emplace_back(task);
|
||||
}
|
||||
common::ThreadPool::GetInstance().SyncRun(tasks);
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,58 +1,58 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
#include "nnacl/base/transpose_base.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class TransposeCPUFwdKernel : public CPUKernel {
|
||||
public:
|
||||
TransposeCPUFwdKernel() = default;
|
||||
~TransposeCPUFwdKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
|
||||
template <typename T>
|
||||
void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);
|
||||
|
||||
TransposeParameter transpose_param_;
|
||||
std::vector<size_t> input_shape_;
|
||||
std::vector<size_t> output_shape_;
|
||||
std::vector<size_t> axes_;
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
using TypeKernel =
|
||||
std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
|
||||
std::unordered_map<TypeId, TypeKernel> launch_map_;
|
||||
TypeKernel launch_func_;
|
||||
};
|
||||
// NOTE(review): MS_REG_CPU_KERNEL is defined outside this view; presumably it registers
// TransposeCPUFwdKernel for the "Transpose" op using a default-constructed KernelAttr - confirm in the factory header.
MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
#include "nnacl/base/transpose_base.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class TransposeCPUFwdKernel : public CPUKernel {
|
||||
public:
|
||||
TransposeCPUFwdKernel() = default;
|
||||
~TransposeCPUFwdKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
|
||||
template <typename T>
|
||||
void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);
|
||||
|
||||
TransposeParameter transpose_param_;
|
||||
std::vector<size_t> input_shape_;
|
||||
std::vector<size_t> output_shape_;
|
||||
std::vector<size_t> axes_;
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
using TypeKernel =
|
||||
std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
|
||||
std::unordered_map<TypeId, TypeKernel> launch_map_;
|
||||
TypeKernel launch_func_;
|
||||
};
|
||||
// NOTE(review): MS_REG_CPU_KERNEL is defined outside this view; presumably it registers
// TransposeCPUFwdKernel for the "Transpose" op using a default-constructed KernelAttr - confirm in the factory header.
MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_
|
||||
|
|
Loading…
Reference in New Issue