!3193 fix gpu Split and Concat memory allocation bug
Merge pull request !3193 from zhaoting/master
This commit is contained in:
commit 4e0cfafcf9
@@ -74,12 +74,12 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     inputs_host_ = std::make_unique<T *[]>(input_num_);
     len_axis_ = std::make_unique<int[]>(input_num_);
     for (int i = 0; i < input_num_; i++) {
-      int input_size = 1;
+      size_t input_size = 1;
       auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
       for (size_t j = 0; j < input_shape.size(); j++) {
-        input_size *= SizeToInt(input_shape[j]);
+        input_size *= input_shape[j];
       }
-      input_size_list_.push_back(IntToSize(input_size * sizeof(T)));
+      input_size_list_.push_back(input_size * sizeof(T));
       len_axis_[i] = SizeToInt(input_shape[axis_]);
     }
     workspace_size_list_.push_back(sizeof(T *) * input_num_);
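Why the widening matters here: the old code accumulated the per-input element count in an int via SizeToInt, so any input with 2^31 or more elements overflowed before the byte size ever reached input_size_list_. A minimal standalone sketch of that failure mode (the shape below is hypothetical, picked to land exactly on the int boundary):

    #include <cstdio>
    #include <vector>

    int main() {
      // Hypothetical shape with exactly 2^31 elements (~8 GiB as float).
      std::vector<size_t> shape = {2048, 1024, 1024};

      int old_count = 1;     // old accumulator: signed overflow is undefined
                             // behavior and wraps in practice
      size_t new_count = 1;  // new accumulator: holds the full count
      for (size_t d : shape) {
        old_count *= static_cast<int>(d);
        new_count *= d;
      }
      std::printf("int accumulator:    %d\n", old_count);  // garbage
      std::printf("size_t accumulator: %zu bytes\n", new_count * sizeof(float));
      return 0;
    }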
@@ -97,7 +97,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
         all_size_before_axis_ *= output_shape[i];
       }
     }
-    output_size_list_.push_back(IntToSize(output_size_ * sizeof(T)));
+    output_size_list_.push_back(output_size_ * sizeof(T));

     InitSizeLists();
     return true;
@@ -117,7 +117,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
   }
   int axis_;
   int input_num_;
-  int output_size_;
+  size_t output_size_;
   int all_size_before_axis_;
   int all_size_axis_;
   std::unique_ptr<T *[]> inputs_host_;
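The member-type change is the half of the fix that is easy to miss: output_size_ (like input_size_ in SplitGpuFwdKernel below) holds a product of tensor dimensions, so as an int it can wrap while being accumulated, long before the push_back multiplies it by sizeof(T). The implicit conversion to size_t in output_size_ * sizeof(T) happens only after the damage is done. A short illustration with hypothetical dimensions:

    #include <cstdio>

    int main() {
      size_t dim0 = 65536, dim1 = 65536;  // hypothetical 2^32-element tensor

      int wrapped = static_cast<int>(dim0) * static_cast<int>(dim1);  // UB: overflows
      size_t exact = dim0 * dim1;                                     // 4294967296

      // wrapped * sizeof(float) would convert to size_t only now, too late.
      std::printf("int member:    %d\n", wrapped);
      std::printf("size_t member: %zu\n", exact);
      return 0;
    }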
@@ -83,7 +83,7 @@ class SplitGpuFwdKernel : public GpuKernel {
         all_size_before_axis_ *= input_shape[i];
       }
     }
-    input_size_list_.push_back(IntToSize(input_size_ * sizeof(T)));
+    input_size_list_.push_back(input_size_ * sizeof(T));
     axis_step_ = input_shape[axis_] / output_num_;

     for (int i = 0; i < output_num_; i++) {
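For orientation, axis_step_ is how much of the split axis each output receives, and each output's byte size follows from it. A small worked example with a hypothetical shape, mirroring the arithmetic this initializer performs:

    #include <cstdio>

    int main() {
      // Hypothetical: split a [4, 6, 8] tensor into 3 outputs along axis 1.
      const int shape[] = {4, 6, 8};
      const int axis = 1;
      const int output_num = 3;

      size_t input_size = 1;
      for (int d : shape) input_size *= d;             // 192 elements in total

      const int axis_step = shape[axis] / output_num;  // 6 / 3 = 2 per output

      // Each output is the input with the split axis shrunk to axis_step.
      const size_t output_size = input_size / shape[axis] * axis_step;  // 64

      std::printf("input: %zu elems, axis_step: %d, each output: %zu elems\n",
                  input_size, axis_step, output_size);
      return 0;
    }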
@@ -138,7 +138,7 @@ class SplitGpuFwdKernel : public GpuKernel {
   }
   int axis_;
   int output_num_;
-  int input_size_;
+  size_t input_size_;
   int axis_step_;
   int all_size_before_axis_;
   int all_size_axis_;
@@ -19,7 +19,7 @@
 #include <cuda_runtime.h>
 #include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
 template <typename T>
-__global__ void Concat(const int size, const int input_num,
+__global__ void Concat(const size_t size, const int input_num,
                        const int all_size_before_axis, const int all_size_axis,
                        int* len_axis, T** inputs, T* output) {
   for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
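Note that only the size parameter is widened in this hunk; the grid-stride loop in the context line still declares int pos, while Split's loop further down already uses size_t. The grid-stride pattern is what lets a bounded grid cover more elements than there are threads, provided the counter type can hold the count. A generic sketch of the pattern, separate from the concat index math that the hunk does not show in full:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Grid-stride loop: each thread starts at its global index and advances by
    // the total thread count, so a fixed grid covers any element count,
    // including counts past 2^31 when the counter is size_t.
    __global__ void Scale(const size_t size, float *data, const float factor) {
      for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size;
           pos += blockDim.x * gridDim.x) {
        data[pos] *= factor;
      }
    }

    int main() {
      const size_t n = 1 << 20;
      float *d = nullptr;
      cudaMalloc(&d, n * sizeof(float));
      cudaMemset(d, 0, n * sizeof(float));
      Scale<<<256, 256>>>(n, d, 2.0f);  // 65536 threads stride over 2^20 elements
      cudaDeviceSynchronize();
      cudaFree(d);
      std::printf("done\n");
      return 0;
    }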
@@ -45,7 +45,7 @@ __global__ void Concat(const int size, const int input_num,
 }

 template <typename T>
-void ConcatKernel(const int size, const int input_num,
+void ConcatKernel(const size_t size, const int input_num,
                   const int all_size_before_axis, const int all_size_axis,
                   int* len_axis, T** inputs, T* output,
                   cudaStream_t cuda_stream) {
@@ -55,15 +55,15 @@ void ConcatKernel(const int size, const int input_num,
   return;
 }

-template void ConcatKernel(const int size, const int input_num,
+template void ConcatKernel(const size_t size, const int input_num,
                            const int all_size_before_axis, const int all_size_axis,
                            int* len_axis, float** inputs, float* output,
                            cudaStream_t cuda_stream);
-template void ConcatKernel(const int size, const int input_num,
+template void ConcatKernel(const size_t size, const int input_num,
                            const int all_size_before_axis, const int all_size_axis,
                            int* len_axis, int** inputs, int* output,
                            cudaStream_t cuda_stream);
-template void ConcatKernel(const int size, const int input_num,
+template void ConcatKernel(const size_t size, const int input_num,
                            const int all_size_before_axis, const int all_size_axis,
                            int* len_axis, half** inputs, half* output,
                            cudaStream_t cuda_stream);
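A practical note on the signature: T** inputs is an array of device pointers that must itself reside in device memory, which is what the sizeof(T *) * input_num_ workspace reserved in the first hunk is for. A minimal sketch of that marshalling pattern (buffer names and sizes hypothetical, not the MindSpore launch path):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      const int input_num = 2;
      const size_t len = 1024;

      // Two device input buffers.
      float *in0 = nullptr, *in1 = nullptr;
      cudaMalloc(&in0, len * sizeof(float));
      cudaMalloc(&in1, len * sizeof(float));

      // Build the pointer table on the host, then copy it into a device-side
      // workspace so a kernel parameter like `inputs` can index inputs[i].
      float *host_table[] = {in0, in1};
      float **dev_table = nullptr;
      cudaMalloc(&dev_table, input_num * sizeof(float *));
      cudaMemcpy(dev_table, host_table, input_num * sizeof(float *),
                 cudaMemcpyHostToDevice);

      std::printf("pointer table staged on device\n");

      cudaFree(dev_table);
      cudaFree(in0);
      cudaFree(in1);
      return 0;
    }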
@@ -19,7 +19,7 @@

 #include "runtime/device/gpu/cuda_common.h"
 template <typename T>
-void ConcatKernel(const int size, const int input_num,
+void ConcatKernel(const size_t size, const int input_num,
                   const int all_size_before_axis, const int all_size_axis,
                   int* len_axis, T** inputs, T* output,
                   cudaStream_t cuda_stream);
@@ -19,7 +19,7 @@
 #include <cuda_runtime.h>
 #include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
 template <typename T>
-__global__ void Split(const int size, const int axis_step, const int all_size_before_axis,
+__global__ void Split(const size_t size, const int axis_step, const int all_size_before_axis,
                       const int all_size_axis, const T* input, T** outputs) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
     int num = pos % all_size_before_axis / all_size_axis;
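The context line int num = pos % all_size_before_axis / all_size_axis; recovers the split-axis coordinate from a flat row-major index. Judging by the names (the hunk does not show how they are computed), all_size_before_axis is the element count of one slice spanning the axis and every later dimension, and all_size_axis covers the later dimensions alone. A host-side check of that reading, with a hypothetical shape:

    #include <cstdio>

    int main() {
      // Hypothetical row-major [2, 6, 4] tensor, split axis 1.
      const int shape[] = {2, 6, 4};
      const int axis = 1;
      const int rank = 3;

      int all_size_before_axis = 1, all_size_axis = 1;
      for (int i = axis; i < rank; i++) all_size_before_axis *= shape[i];  // 24
      for (int i = axis + 1; i < rank; i++) all_size_axis *= shape[i];     // 4

      // Flat index of element (1, 5, 2) is 1*24 + 5*4 + 2 = 46.
      const int pos = 1 * 24 + 5 * 4 + 2;
      const int num = pos % all_size_before_axis / all_size_axis;  // (46 % 24) / 4

      std::printf("axis coordinate: %d (expected 5)\n", num);
      return 0;
    }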
@@ -32,19 +32,19 @@ __global__ void Split(const int size, const int axis_step, const int all_size_before_axis,
 }

 template <typename T>
-void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                  const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) {
   Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, axis_step, all_size_before_axis,
                                                            all_size_axis, input, outputs);
   return;
 }

-template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                           const int all_size_axis, const float* input, float** outputs,
                           cudaStream_t cuda_stream);
-template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                           const int all_size_axis, const int* input, int** outputs,
                           cudaStream_t cuda_stream);
-template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                           const int all_size_axis, const half* input, half** outputs,
                           cudaStream_t cuda_stream);
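GET_BLOCKS and GET_THREADS come from runtime/device/gpu/cuda_common.h, which this diff does not touch, so the definitions below are only a conventional sketch of such launch helpers, not the actual MindSpore ones. The point of interest for this fix: grid dimensions are finite, so helpers of this kind often cap the block count and rely on the kernel's grid-stride loop to absorb the remainder.

    #include <algorithm>
    #include <cstddef>

    // Hypothetical launch helpers in the style of cuda_common.h (assumed).
    constexpr int kThreadsPerBlock = 256;
    constexpr int kMaxBlocks = 65535;

    inline int GET_THREADS_SKETCH() { return kThreadsPerBlock; }

    inline int GET_BLOCKS_SKETCH(const size_t total) {
      // Round up, then clamp: whatever the capped grid cannot reach in one
      // pass, the kernel's grid-stride loop covers on later iterations.
      const size_t blocks = (total + kThreadsPerBlock - 1) / kThreadsPerBlock;
      return static_cast<int>(std::min<size_t>(blocks, kMaxBlocks));
    }

    // 2^33 elements would need 2^25 blocks; the sketch clamps to kMaxBlocks.
    int main() { return GET_BLOCKS_SKETCH(size_t{1} << 33) == kMaxBlocks ? 0 : 1; }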
@@ -19,6 +19,6 @@

 #include "runtime/device/gpu/cuda_common.h"
 template <typename T>
-void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                  const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream);
 #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_