Add CPU kernels: TensorAdd, Sub, Mul, Div

This commit is contained in:
huanghui 2020-09-10 14:11:16 +08:00
parent 55372b41fd
commit a82577c00e
8 changed files with 388 additions and 98 deletions

View File

@ -0,0 +1,137 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h"
#include <exception>
#include <string>
#include <thread>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
// Element-wise addition over the index range [start, end).
// When is_number is true the value pointed to by input2 is added to every element of input1;
// otherwise input2 is indexed with the same position as input1.
template <typename T>
void Add(const T *input1, const T *input2, T *out, size_t start, size_t end, bool is_number) {
  size_t pos = start;
  while (pos < end) {
    const T rhs = is_number ? input2[0] : input2[pos];
    out[pos] = input1[pos] + rhs;
    ++pos;
  }
}
// Element-wise subtraction over the index range [start, end).
// When is_number is true the value pointed to by input2 is subtracted from every element of input1;
// otherwise input2 is indexed with the same position as input1.
template <typename T>
void Sub(const T *input1, const T *input2, T *out, size_t start, size_t end, bool is_number) {
  if (is_number) {
    for (size_t idx = start; idx < end; ++idx) {
      out[idx] = input1[idx] - *input2;
    }
    return;
  }
  for (size_t idx = start; idx < end; ++idx) {
    out[idx] = input1[idx] - input2[idx];
  }
}
// Element-wise multiplication over the index range [start, end).
// When is_number is true every element of input1 is multiplied by the value pointed to by input2;
// otherwise input2 is indexed with the same position as input1.
template <typename T>
void Mul(const T *input1, const T *input2, T *out, size_t start, size_t end, bool is_number) {
  for (size_t idx = start; idx < end; ++idx) {
    if (is_number) {
      out[idx] = input1[idx] * (*input2);
    } else {
      out[idx] = input1[idx] * input2[idx];
    }
  }
}
// Element-wise division over the index range [start, end).
// When is_number is true the value pointed to by input2 is the divisor for every element;
// otherwise input2 is indexed with the same position as input1.
// A divisor equal to 0 is reported through MS_LOG(EXCEPTION) before the division is attempted.
template <typename T>
void Div(const T *input1, const T *input2, T *out, size_t start, size_t end, bool is_number) {
  size_t pos = start;
  while (pos < end) {
    const T divisor = is_number ? *input2 : input2[pos];
    if (divisor == 0) {
      MS_LOG(EXCEPTION) << "Cannot divided by 0!";
    }
    out[pos] = input1[pos] / divisor;
    ++pos;
  }
}
} // namespace
// Reads the operator kind, the operand shapes and the operand data type from kernel_node.
//
// - operate_type_ is chosen from the node name (TensorAdd, Sub, Mul or Div); any other name is rejected
//   instead of silently keeping the default ADD.
// - is_number_ becomes true when the inferred shape of input 1 has no dimensions; otherwise both inputs
//   must have identical shapes.
// - dtype_ is the inferred data type of input 0, which must match the data type of input 1.
// Every violated requirement is reported through MS_LOG(EXCEPTION).
void ArithmeticCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  if (kernel_name == prim::kPrimTensorAdd->name()) {
    operate_type_ = ADD;
  } else if (kernel_name == prim::kPrimSub->name()) {
    operate_type_ = SUB;
  } else if (kernel_name == prim::kPrimMul->name()) {
    operate_type_ = MUL;
  } else if (kernel_name == "Div") {
    operate_type_ = DIV;
  } else {
    // Without this branch an unknown operator would be computed as an addition.
    MS_LOG(EXCEPTION) << "Not support " << kernel_name;
  }

  auto shape0 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  auto shape1 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
  is_number_ = (shape1.size() == 0);
  // Container inequality covers both the rank check and the per-dimension check.
  if (!is_number_ && shape0 != shape1) {
    MS_LOG(EXCEPTION) << "Input0 and input1 must has the same shape";
  }

  dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
  if (dtype_ != AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 1)) {
    MS_LOG(EXCEPTION) << "Input0 and input1 must has the same data type";
  }
}
// Runs the kernel by dispatching to the LaunchKernel<T> instantiation that matches dtype_.
// Supported data types are int32, float32 and int64; any other type is reported through MS_LOG(EXCEPTION).
// The workspace argument is unused. Returns true once the computation has finished.
bool ArithmeticCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                 const std::vector<kernel::AddressPtr> & /*workspace*/,
                                 const std::vector<kernel::AddressPtr> &outputs) {
  if (dtype_ == kNumberTypeInt32) {
    LaunchKernel<int>(inputs, outputs);
  } else if (dtype_ == kNumberTypeFloat32) {
    LaunchKernel<float>(inputs, outputs);
  } else if (dtype_ == kNumberTypeInt64) {
    LaunchKernel<int64_t>(inputs, outputs);
  } else {
    // The message lists every type handled above (int64 was previously missing).
    MS_LOG(EXCEPTION) << "Only support int32, float32, int64, but actual data type is " << TypeIdLabel(dtype_);
  }
  return true;
}
template <typename T>
void ArithmeticCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
T *input1 = reinterpret_cast<T *>(inputs[0]->addr);
T *input2 = reinterpret_cast<T *>(inputs[1]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);
auto lens = inputs[0]->size / sizeof(T);
MS_LOG(INFO) << "lens=" << lens;
const size_t thread_num = 24;
std::vector<std::thread> threads;
threads.reserve(thread_num);
size_t start = 0;
size_t once_compute_size = (lens + thread_num - 1) / thread_num;
while (start < lens) {
size_t end = (start + once_compute_size) > lens ? lens : (start + once_compute_size);
if (operate_type_ == ADD) {
threads.emplace_back(std::thread(Add<T>, input1, input2, output, start, end, is_number_));
} else if (operate_type_ == SUB) {
threads.emplace_back(std::thread(Sub<T>, input1, input2, output, start, end, is_number_));
} else if (operate_type_ == MUL) {
threads.emplace_back(std::thread(Mul<T>, input1, input2, output, start, end, is_number_));
} else if (operate_type_ == DIV) {
threads.emplace_back(std::thread(Div<T>, input1, input2, output, start, end, is_number_));
}
start += once_compute_size;
}
for (size_t i = 0; i < threads.size(); ++i) {
threads[i].join();
}
}
} // namespace kernel
} // namespace mindspore

View File

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SUB_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SUB_CPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@ -22,24 +22,35 @@
namespace mindspore {
namespace kernel {
// NOTE(review): this span is a rendered diff hunk; lines of the removed SubCPUKernel declaration and of the
// added ArithmeticCPUKernel declaration appear interleaved without +/- markers — confirm against the repository.
class SubCPUKernel : public CPUKernel {
class ArithmeticCPUKernel : public CPUKernel {
public:
SubCPUKernel() : offset_(0) {}
~SubCPUKernel() override = default;
ArithmeticCPUKernel() = default;
~ArithmeticCPUKernel() override = default;
// Reads the operator kind, operand shapes and operand data type from kernel_node.
void InitKernel(const CNodePtr &kernel_node) override;
// Runs the kernel; the ArithmeticCPUKernel implementation dispatches to LaunchKernel<T> by dtype_.
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
// Typed implementation: inputs[0] and inputs[1] are the operands, outputs[0] receives the result.
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
private:
// SubCPUKernel only: the value read from inputs[1] in Launch and subtracted from every element.
int offset_;
// True when the inferred shape of input 1 has no dimensions (set in InitKernel).
bool is_number_{false};
// Operator selected from the node name in InitKernel.
OperateType operate_type_{ADD};
// Inferred data type of input 0 (set in InitKernel).
TypeId dtype_{kTypeUnknown};
};
// Registers the CPU kernel for the Sub operator with int32, float32 and int64 inputs/outputs.
// NOTE(review): the first registration carries both the removed (SubCPUKernel) and the added
// (ArithmeticCPUKernel) class argument because this is a rendered diff hunk — confirm against the repository.
MS_REG_CPU_KERNEL(
Sub, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
SubCPUKernel);
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
Sub, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
Sub, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
ArithmeticCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SUB_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_

View File

@ -0,0 +1,91 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h"
#include <cmath>
#include <thread>
#include <string>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
// Writes in[i] * in[i] to out[i] for every index i in [start, end).
template <typename T>
void Square(const T *in, T *out, size_t start, size_t end) {
  size_t idx = start;
  while (idx < end) {
    const T value = in[idx];
    out[idx] = value * value;
    ++idx;
  }
}
// Writes the square root of in[i] to out[i] for every index i in [start, end).
// std::sqrt selects the overload matching T (float stays float; integral inputs are computed in double),
// so the result is not forced through single precision as it was with sqrtf. The value is converted back to T.
template <typename T>
void Sqrt(const T *in, T *out, size_t start, size_t end) {
  for (size_t i = start; i < end; i++) {
    out[i] = static_cast<T>(std::sqrt(in[i]));
  }
}
} // namespace
void ArithmeticSelfCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == prim::kPrimSquare->name()) {
operate_type_ = SQUARE;
} else if (kernel_name == prim::kPrimSqrt->name()) {
operate_type_ = SQRT;
}
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
}
// Runs the kernel by dispatching to the LaunchKernel<T> instantiation that matches dtype_.
// Supported data types are float32 and int32; any other type is reported through MS_LOG(EXCEPTION).
// The workspace argument is unused. Returns true once the computation has finished.
bool ArithmeticSelfCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                     const std::vector<kernel::AddressPtr> & /*workspace*/,
                                     const std::vector<kernel::AddressPtr> &outputs) {
  if (dtype_ == kNumberTypeFloat32) {
    LaunchKernel<float>(inputs, outputs);
  } else if (dtype_ == kNumberTypeInt32) {
    // int32 buffers must be processed as int; reading them as float would reinterpret the bit patterns.
    LaunchKernel<int>(inputs, outputs);
  } else {
    MS_LOG(EXCEPTION) << "Only support float32, int32, but actual data type is " << TypeIdLabel(dtype_);
  }
  return true;
}
// Computes the selected unary operator for element type T.
// inputs[0] is the operand and outputs[0] receives the result; the element count is derived from the byte
// size of inputs[0]. The index range is split into at most 24 contiguous chunks, each handled by its own
// std::thread, and all threads are joined before returning.
template <typename T>
void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
                                           const std::vector<AddressPtr> &outputs) {
  auto *src = reinterpret_cast<T *>(inputs[0]->addr);
  auto *dst = reinterpret_cast<T *>(outputs[0]->addr);
  const auto element_num = inputs[0]->size / sizeof(T);
  MS_LOG(INFO) << "lens=" << element_num;

  const size_t max_workers = 24;
  // Ceiling division: zero for an empty input, in which case the loop below does not run.
  const size_t chunk = (element_num + max_workers - 1) / max_workers;
  std::vector<std::thread> workers;
  workers.reserve(max_workers);
  for (size_t begin = 0; begin < element_num; begin += chunk) {
    size_t finish = begin + chunk;
    if (finish > element_num) {
      finish = element_num;
    }
    switch (operate_type_) {
      case SQUARE:
        workers.emplace_back(std::thread(Square<T>, src, dst, begin, finish));
        break;
      case SQRT:
        workers.emplace_back(std::thread(Sqrt<T>, src, dst, begin, finish));
        break;
      default:
        break;
    }
  }
  for (auto &worker : workers) {
    worker.join();
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,50 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for single-input element-wise operators (Square and Sqrt are recognised by InitKernel).
class ArithmeticSelfCPUKernel : public CPUKernel {
 public:
  ArithmeticSelfCPUKernel() = default;
  ~ArithmeticSelfCPUKernel() override = default;

  // Reads the operator kind and the input data type from kernel_node.
  void InitKernel(const CNodePtr &kernel_node) override;

  // Runs the kernel by dispatching to LaunchKernel<T> according to dtype_.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

  // Typed implementation: inputs[0] is the operand, outputs[0] receives the result.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

 private:
  // Operator selected from the node name in InitKernel.
  OperateType operate_type_ = SQUARE;
  // Inferred data type of input 0 (set in InitKernel).
  TypeId dtype_ = kTypeUnknown;
};
// Registers ArithmeticSelfCPUKernel for the Square operator with float32 and int32 input/output.
// NOTE(review): no Sqrt registration is visible in this header although InitKernel recognises Sqrt —
// confirm whether it is registered elsewhere.
MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ArithmeticSelfCPUKernel);
MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
ArithmeticSelfCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_

View File

@ -51,6 +51,7 @@ const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
// Operator selector used by the arithmetic CPU kernels (binary: ADD..DIV, unary: SQUARE, SQRT).
enum OperateType {
  ADD = 0,
  SUB = 1,
  MUL = 2,
  DIV = 3,
  SQUARE = 4,
  SQRT = 5
};
class CPUKernel : public kernel::KernelMod {
public:

View File

@ -1,90 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/sub_cpu_kernel.h"
#include <sys/time.h>
#include <thread>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Validates that input 1 has the inferred shape [1]; any other shape is reported through MS_LOG(EXCEPTION).
void SubCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
  const bool single_element = (shape.size() == 1) && (shape[0] == 1);
  if (!single_element) {
    MS_LOG(EXCEPTION) << "input 1 only support scalar";
  }
}
// Writes in_addr[i] - offset to out_addr[i] for the first lens elements, in ascending index order.
void sub_task(const int *in_addr, int *out_addr, size_t lens, int offset) {
  size_t idx = 0;
  while (idx < lens) {
    out_addr[idx] = in_addr[idx] - offset;
    ++idx;
  }
}
// Subtracts the int value stored in inputs[1] from every int element of inputs[0] and writes the result
// to outputs[0]. Inputs with fewer than 10000 elements are processed on the calling thread; larger inputs
// are split into 4 contiguous chunks processed by 4 threads. The elapsed time is logged at INFO level.
// The workspace argument is unused. Always returns true.
bool SubCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                          const std::vector<kernel::AddressPtr> & /*workspace*/,
                          const std::vector<kernel::AddressPtr> &outputs) {
#if defined(_WIN32) || defined(_WIN64)
  auto start_time = std::chrono::steady_clock::now();
#else
  struct timeval start_time, end_time;
  (void)gettimeofday(&start_time, nullptr);
#endif
  auto input_addr = reinterpret_cast<int *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<int *>(outputs[0]->addr);
  offset_ = *reinterpret_cast<int *>(inputs[1]->addr);
  MS_LOG(INFO) << "offset: " << offset_;
  auto lens = inputs[0]->size / sizeof(int);
  if (lens < 10000) {
    for (size_t i = 0; i < lens; i++) {
      output_addr[i] = input_addr[i] - offset_;
    }
  } else {
    const size_t thread_num = 4;
    std::thread threads[4];
    // Ceiling division, so 4 * process_lens may exceed lens by up to 3 elements.
    const size_t process_lens = (lens + thread_num - 1) / thread_num;
    size_t process_offset = 0;
    for (size_t i = 0; i < thread_num; i++) {
      // Clamp the chunk BEFORE starting the thread; clamping afterwards let the last thread read and
      // write past the end of the buffers whenever lens was not a multiple of thread_num.
      const size_t remain = lens - process_offset;
      const size_t task_lens = process_lens < remain ? process_lens : remain;
      threads[i] = std::thread(sub_task, input_addr + process_offset, output_addr + process_offset, task_lens, offset_);
      process_offset += task_lens;
    }
    for (size_t i = 0; i < thread_num; i++) {
      threads[i].join();
    }
  }
#if defined(_WIN32) || defined(_WIN64)
  auto end_time = std::chrono::steady_clock::now();
  std::chrono::duration<double, std::ratio<1, 1000000>> cost = end_time - start_time;
  MS_LOG(INFO) << "SubscaleCPUKernel, used time: " << cost.count() << " us";
#else
  (void)gettimeofday(&end_time, nullptr);
  uint64_t time = 1000000 * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  time += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  MS_LOG(INFO) << "SubCPUKernel, used time: " << time << " us";
#endif
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,46 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
import mindspore
from mindspore import Tensor
from mindspore.ops import operations as P
# Configure MindSpore for graph mode with the CPU device target before the test runs.
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class SubNet(nn.Cell):
    """Cell that applies the Sub primitive to its two inputs."""

    def __init__(self):
        super().__init__()
        self.sub = P.Sub()

    def construct(self, x, y):
        # Forward both operands unchanged to the primitive.
        return self.sub(x, y)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_sub():
    """Sub of an int32 tensor of ones and the int32 value 1 must yield all zeros."""
    x = np.ones([2, 3, 4, 4]).astype(np.int32)
    y = 1
    net = SubNet()
    output = net(Tensor(x), Tensor(y, mindspore.int32))
    # np.int was only an alias of the builtin int and has been removed from NumPy;
    # np.int32 also matches the dtype of the kernel output.
    expect_output = np.zeros([2, 3, 4, 4]).astype(np.int32)
    print(output)
    assert np.all(output.asnumpy() == expect_output)

View File

@ -0,0 +1,44 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
# Configure MindSpore for graph mode with the CPU device target before the test runs.
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class SquareNet(nn.Cell):
    """Cell that applies the Square primitive to its input."""

    def __init__(self):
        super().__init__()
        self.square = P.Square()

    def construct(self, x):
        # Forward the operand unchanged to the primitive.
        return self.square(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_square():
    """Square of the float32 vector [1, 2, 3] must equal [1, 4, 9]."""
    data = np.array([1, 2, 3]).astype(np.float32)
    square_net = SquareNet()
    result = square_net(Tensor(data))
    expected = np.array([1, 4, 9]).astype(np.float32)
    print(result)
    matches = result.asnumpy() == expected
    assert np.all(matches)