Gpu Slice kernel performance improvement

This commit is contained in:
wilfChen 2020-04-15 09:11:07 +08:00
parent d9dd6aa0b8
commit 5b7790a2a7
4 changed files with 60 additions and 33 deletions

View File

@ -41,8 +41,9 @@ class SliceGpuFwdKernel : public GpuKernel {
CalStridedSlice(output_size_ / sizeof(T), input, input_shape_, begin_, size_, strides_, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CalSlice(output_size_ / sizeof(T), input, input_shape_, begin_, size_, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
Slice4DKernel(begin_[0], begin_[1], begin_[2], begin_[3], size_[0], size_[1], size_[2], size_[3], input_shape_[0],
input_shape_[1], input_shape_[2], input_shape_[3], input, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
return true;
}

View File

@ -21,11 +21,22 @@
#include "kernel/gpu/cuda_impl/slice_impl.cuh"
template <typename T>
__global__ void Slice(const T* input, int p, int start, int length, T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (length); pos += blockDim.x * gridDim.x) {
output[p + pos] = input[start + pos];
__global__ void Slice4D(const int s1, const int s2, const int s3, const int s4,
const int l1, const int l2, const int l3, const int l4,
const int d1, const int d2, const int d3, const int d4,
const T *input, T *output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (l1 * l2 * l3 * l4); pos += blockDim.x * gridDim.x) {
int i = pos / (l2 * l3 * l4) % l1;
int j = pos / (l3 * l4) % l2;
int k = pos / l4 % l3;
int o = pos % l4;
int offset = (i + s1) * (d2 * d3 * d4) +
(j + s2) * (d3 * d4) +
(k + s3) * d4 +
(o + s4);
output[pos] = input[offset];
}
return;
}
template <typename T>
__global__ void SliceGrad(const T* dy, int p, int start, int length, T* output) {
@ -64,22 +75,12 @@ void FillDeviceArray(const size_t input_size, T* addr, const float value, cudaSt
return;
}
template <typename T>
void CalSlice(const size_t input_size, const T* input, const std::vector<int> in_shape, const std::vector<int> begin,
const std::vector<int> size, T* output, cudaStream_t cuda_stream) {
int block = in_shape[1] * in_shape[2] * in_shape[3];
int map = in_shape[2] * in_shape[3];
int w = in_shape[3];
int length = size[3];
int p = 0;
for (int i = begin[0]; i < size[0] + begin[0]; i++) {
for (int j = begin[1]; j < size[1] + begin[1]; j++) {
for (int k = begin[2]; k < size[2] + begin[2]; k++) {
Slice<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input, p, i * block + j * map + k * w + begin[3],
length, output);
p = p + size[3];
}
}
}
void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
const int l1, const int l2, const int l3, const int l4,
const int d1, const int d2, const int d3, const int d4,
const T *input, T *output, cudaStream_t stream) {
Slice4D<<<GET_BLOCKS(l1 * l2 * l3 * l4), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, l1, l2, l3, l4,
d1, d2, d3, d4, input, output);
}
template <typename T>
void CalSliceGrad(const size_t input_size, const T* dy, const std::vector<int> in_shape, const std::vector<int> begin,
@ -147,9 +148,10 @@ void CalStridedSliceGrad(const size_t input_size, const T* dy, const std::vector
}
template void FillDeviceArray<float>(const size_t input_size, float* addr, const float value, cudaStream_t cuda_stream);
template void CalSlice<float>(const size_t input_size, const float* input, const std::vector<int> in_shape,
const std::vector<int> begin, const std::vector<int> size, float* output,
cudaStream_t cuda_stream);
template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
const int l1, const int l2, const int l3, const int l4,
const int d1, const int d2, const int d3, const int d4,
const float *input, float *output, cudaStream_t stream);
template void CalSliceGrad<float>(const size_t input_size, const float* dy, const std::vector<int> in_shape,
const std::vector<int> begin, const std::vector<int> size, float* output,
cudaStream_t cuda_stream);
@ -160,9 +162,10 @@ template void CalStridedSliceGrad<float>(const size_t input_size, const float* d
const std::vector<int> begin, const std::vector<int> end,
const std::vector<int> strides, float* dx, cudaStream_t cuda_stream);
template void FillDeviceArray<half>(const size_t input_size, half* addr, const float value, cudaStream_t cuda_stream);
template void CalSlice<half>(const size_t input_size, const half* input, const std::vector<int> in_shape,
const std::vector<int> begin, const std::vector<int> size, half* output,
cudaStream_t cuda_stream);
template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
const int l1, const int l2, const int l3, const int l4,
const int d1, const int d2, const int d3, const int d4,
const half *input, half *output, cudaStream_t stream);
template void CalSliceGrad<half>(const size_t input_size, const half* dy, const std::vector<int> in_shape,
const std::vector<int> begin, const std::vector<int> size, half* output,
cudaStream_t cuda_stream);
@ -173,9 +176,10 @@ template void CalStridedSliceGrad<half>(const size_t input_size, const half* dy,
const std::vector<int> begin, const std::vector<int> end,
const std::vector<int> strides, half* dx, cudaStream_t cuda_stream);
template void FillDeviceArray<int>(const size_t input_size, int* addr, const float value, cudaStream_t cuda_stream);
template void CalSlice<int>(const size_t input_size, const int* input, const std::vector<int> in_shape,
const std::vector<int> begin, const std::vector<int> size, int* output,
cudaStream_t cuda_stream);
template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
const int l1, const int l2, const int l3, const int l4,
const int d1, const int d2, const int d3, const int d4,
const int *input, int *output, cudaStream_t stream);
template void CalSliceGrad<int>(const size_t input_size, const int* dy, const std::vector<int> in_shape,
const std::vector<int> begin, const std::vector<int> size, int* output,
cudaStream_t cuda_stream);

View File

@ -21,9 +21,12 @@
#include <vector>
#include "device/gpu/cuda_common.h"
template <typename T>
void CalSlice(const size_t input_size, const T* input, const std::vector<int> in_shape, const std::vector<int> begin,
const std::vector<int> size, T* output, cudaStream_t cuda_stream);
void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
const int l1, const int l2, const int l3, const int l4,
const int d1, const int d2, const int d3, const int d4,
const T *input, T *output, cudaStream_t stream);
template <typename T>
void CalSliceGrad(const size_t input_size, const T* input, const std::vector<int> in_shape,
const std::vector<int> begin, const std::vector<int> size, T* output, cudaStream_t cuda_stream);

View File

@ -43,3 +43,22 @@ def test_slice():
slice = Slice()
output = slice(x)
assert (output.asnumpy() == expect).all()
class SliceNet(nn.Cell):
def __init__(self):
super(SliceNet, self).__init__()
self.slice = P.Slice()
def construct(self, x):
return self.slice(x, (0, 11, 0, 0), (32, 7, 224, 224))
def test_slice_4d():
x_np = np.random.randn(32, 24, 224, 224).astype(np.float32)
output_np = x_np[:, 11:18, :, :]
x_ms = Tensor(x_np)
net = SliceNet()
output_ms = net(x_ms)
assert (output_ms.asnumpy() == output_np).all()