commit 4dae89b273
parent fefa5651e5
Author: zong-shuai
Date:   2023-01-05 14:58:44 +08:00

5 changed files with 51 additions and 48 deletions

View File

@@ -125,8 +125,8 @@ inline int Log2Ceil64(uint64_t n) {
 }
 template <typename IndexType>
-__global__ void CoalesceKernelCheck(IndexType *indices_ptr, IndexType *segment_ids_ptr, IndexType *num_segments_ptr,
-size_t outer_size, int *ret_flag, size_t indices_size) {
+__global__ void InputValidCheck(IndexType *indices_ptr, IndexType *segment_ids_ptr, IndexType *num_segments_ptr,
+size_t outer_size, int *ret_flag, size_t indices_size) {
 for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < indices_size; i += gridDim.x * blockDim.x) {
 if ((i != indices_size - 1) && (segment_ids_ptr[i] > segment_ids_ptr[i + 1])) {
 *ret_flag = 1;
@@ -144,32 +144,32 @@ __global__ void CoalesceKernelCheck(IndexType *indices_ptr, IndexType *segment_i
 }
 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void CalSparseSegmentMeanWithNumSegments(const DataType *x_ptr, const IndexType *indices_ptr,
-const IndexType *segment_ids_ptr,
-const IndexType *num_segments_ptr, size_t *segment_pos_ptr,
-DataType *y_ptr, size_t outer_size, size_t inner_size,
-size_t indices_size, size_t segment_size, size_t x_size,
-size_t y_size, size_t batch_size, int *ret_flag_host,
-uint32_t device_id, cudaStream_t cuda_stream) {
+CUDA_LIB_EXPORT int CalSparseSegmentMeanWithNumSegments(const DataType *x_ptr, const IndexType *indices_ptr,
+const IndexType *segment_ids_ptr,
+const IndexType *num_segments_ptr, size_t *segment_pos_ptr,
+DataType *y_ptr, size_t outer_size, size_t inner_size,
+size_t indices_size, size_t segment_size, size_t x_size,
+size_t y_size, size_t batch_size, int *ret_flag_device,
+uint32_t device_id, cudaStream_t cuda_stream) {
 // Get start position of each segment and set to segment_pos_ptr.
 // The last element of segment_pos_ptr must equal to segment_size.
-int *ret_flag_device = nullptr;
-(void)cudaMalloc(&ret_flag_device, sizeof(int));
-(void)cudaMemset(ret_flag_device, 0, sizeof(int));
-CoalesceKernelCheck<<<CUDA_BLOCKS(device_id, indices_size + 1), CUDA_THREADS(device_id), 0, cuda_stream>>>(
+int ret_flag_host = 0;
+int thread_num = indices_size + 1 > 256 ? 256 : (indices_size + 1);
+(void)cudaMemsetAsync(ret_flag_device, 0, sizeof(int), cuda_stream);
+InputValidCheck<<<CUDA_BLOCKS_CAL(device_id, indices_size + 1, thread_num), thread_num, 0, cuda_stream>>>(
 indices_ptr, segment_ids_ptr, num_segments_ptr, outer_size, ret_flag_device, indices_size);
-(void)cudaMemcpy(ret_flag_host, ret_flag_device, sizeof(int), cudaMemcpyDeviceToHost);
-(void)cudaFree(ret_flag_device);
-if (*ret_flag_host != 0) {
-return;
+(void)cudaMemcpyAsync(&ret_flag_host, ret_flag_device, sizeof(int), cudaMemcpyDeviceToHost, cuda_stream);
+cudaStreamSynchronize(cuda_stream);
+if (ret_flag_host != 0) {
+return ret_flag_host;
 }
-SparseSegmentPosKernel<<<CUDA_BLOCKS(device_id, indices_size + 1), CUDA_THREADS(device_id), 0, cuda_stream>>>(
+SparseSegmentPosKernel<<<CUDA_BLOCKS_CAL(device_id, indices_size + 1, thread_num), thread_num, 0, cuda_stream>>>(
 segment_ids_ptr, segment_pos_ptr, indices_size, segment_size);
 const unsigned int max_grid_x = (1u << 31) - 1;
 const unsigned int max_grid_y = (1u << 16) - 1;
-const unsigned int max_block_x = 1024;
-const unsigned int max_block_y = 64;
+const unsigned int max_block_x = 64;
+const unsigned int max_block_y = 8;
 unsigned int inner_power2 = 1u << Log2Ceil64(inner_size);
 unsigned int avg_reduce_size = UP_DIV(outer_size, segment_size);
 unsigned int avg_reduce_size_power2 = 1u << Log2Ceil64(avg_reduce_size);
@@ -188,32 +188,32 @@ CUDA_LIB_EXPORT void CalSparseSegmentMeanWithNumSegments(const DataType *x_ptr,
 SparseSegmentMeanWithNumSegmentsKernel<<<grid, block, shared_memory_size, cuda_stream>>>(
 batch_x_ptr, batch_indices_ptr, segment_pos_ptr, batch_y_ptr, outer_size, inner_size, segment_size);
 }
-return;
+return ret_flag_host;
 }
-template CUDA_LIB_EXPORT void CalSparseSegmentMeanWithNumSegments<half, int32_t>(
+template CUDA_LIB_EXPORT int CalSparseSegmentMeanWithNumSegments<half, int32_t>(
 const half *x_ptr, const int32_t *indices_ptr, const int32_t *segment_ids_ptr, const int32_t *num_segments_ptr,
 size_t *segment_pos_ptr, half *y_ptr, size_t outer_size, size_t inner_size, size_t indices_size, size_t segment_size,
-size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_host, uint32_t device_id, cudaStream_t cuda_stream);
-template CUDA_LIB_EXPORT void CalSparseSegmentMeanWithNumSegments<float, int32_t>(
+size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_device, uint32_t device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT int CalSparseSegmentMeanWithNumSegments<float, int32_t>(
 const float *x_ptr, const int32_t *indices_ptr, const int32_t *segment_ids_ptr, const int32_t *num_segments_ptr,
 size_t *segment_pos_ptr, float *y_ptr, size_t outer_size, size_t inner_size, size_t indices_size, size_t segment_size,
-size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_host, uint32_t device_id, cudaStream_t cuda_stream);
-template CUDA_LIB_EXPORT void CalSparseSegmentMeanWithNumSegments<double, int32_t>(
+size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_device, uint32_t device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT int CalSparseSegmentMeanWithNumSegments<double, int32_t>(
 const double *x_ptr, const int32_t *indices_ptr, const int32_t *segment_ids_ptr, const int32_t *num_segments_ptr,
 size_t *segment_pos_ptr, double *y_ptr, size_t outer_size, size_t inner_size, size_t indices_size,
-size_t segment_size, size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_host, uint32_t device_id,
+size_t segment_size, size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_device, uint32_t device_id,
 cudaStream_t cuda_stream);
-template CUDA_LIB_EXPORT void CalSparseSegmentMeanWithNumSegments<half, int64_t>(
+template CUDA_LIB_EXPORT int CalSparseSegmentMeanWithNumSegments<half, int64_t>(
 const half *x_ptr, const int64_t *indices_ptr, const int64_t *segment_ids_ptr, const int64_t *num_segments_ptr,
 size_t *segment_pos_ptr, half *y_ptr, size_t outer_size, size_t inner_size, size_t indices_size, size_t segment_size,
-size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_host, uint32_t device_id, cudaStream_t cuda_stream);
-template CUDA_LIB_EXPORT void CalSparseSegmentMeanWithNumSegments<float, int64_t>(
+size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_device, uint32_t device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT int CalSparseSegmentMeanWithNumSegments<float, int64_t>(
 const float *x_ptr, const int64_t *indices_ptr, const int64_t *segment_ids_ptr, const int64_t *num_segments_ptr,
 size_t *segment_pos_ptr, float *y_ptr, size_t outer_size, size_t inner_size, size_t indices_size, size_t segment_size,
-size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_host, uint32_t device_id, cudaStream_t cuda_stream);
-template CUDA_LIB_EXPORT void CalSparseSegmentMeanWithNumSegments<double, int64_t>(
+size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_device, uint32_t device_id, cudaStream_t cuda_stream);
+template CUDA_LIB_EXPORT int CalSparseSegmentMeanWithNumSegments<double, int64_t>(
 const double *x_ptr, const int64_t *indices_ptr, const int64_t *segment_ids_ptr, const int64_t *num_segments_ptr,
 size_t *segment_pos_ptr, double *y_ptr, size_t outer_size, size_t inner_size, size_t indices_size,
-size_t segment_size, size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_host, uint32_t device_id,
+size_t segment_size, size_t x_size, size_t y_size, size_t batch_size, int *ret_flag_device, uint32_t device_id,
 cudaStream_t cuda_stream);
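The implementation change above replaces the per-call cudaMalloc/cudaMemcpy/cudaFree around the validation flag with a caller-provided device buffer, asynchronous memset/copy on the compute stream, a single stream synchronization, and an int status return. The following is a minimal standalone sketch of that pattern, not the MindSpore code itself; CheckKernel, ValidateSegmentIds, and the 256-thread cap are illustrative stand-ins.

  #include <cuda_runtime.h>
  #include <cstdio>

  // Flag unsorted segment ids; several threads may write 1 concurrently, which is fine.
  __global__ void CheckKernel(const int *segment_ids, size_t n, int *ret_flag) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
      if (i + 1 < n && segment_ids[i] > segment_ids[i + 1]) {
        *ret_flag = 1;
      }
    }
  }

  // ret_flag_dev is a preallocated device buffer (the workspace slot in the commit);
  // returns 0 if the input passed the check, non-zero otherwise.
  int ValidateSegmentIds(const int *segment_ids_dev, size_t n, int *ret_flag_dev, cudaStream_t stream) {
    int ret_flag_host = 0;
    // Cap the block size the same way the commit does: at most 256 threads.
    int thread_num = (n > 256) ? 256 : static_cast<int>(n);
    int block_num = static_cast<int>((n + thread_num - 1) / thread_num);
    (void)cudaMemsetAsync(ret_flag_dev, 0, sizeof(int), stream);
    CheckKernel<<<block_num, thread_num, 0, stream>>>(segment_ids_dev, n, ret_flag_dev);
    (void)cudaMemcpyAsync(&ret_flag_host, ret_flag_dev, sizeof(int), cudaMemcpyDeviceToHost, stream);
    // One synchronization point: the host needs the flag before deciding whether to launch more work.
    cudaStreamSynchronize(stream);
    return ret_flag_host;
  }

  int main() {
    const size_t n = 4;
    int host_ids[n] = {0, 1, 1, 0};  // deliberately unsorted
    int *ids_dev = nullptr, *flag_dev = nullptr;
    cudaMalloc(&ids_dev, n * sizeof(int));
    cudaMalloc(&flag_dev, sizeof(int));  // stands in for the workspace buffer
    cudaMemcpy(ids_dev, host_ids, n * sizeof(int), cudaMemcpyHostToDevice);
    printf("flag = %d\n", ValidateSegmentIds(ids_dev, n, flag_dev, 0));
    cudaFree(ids_dev);
    cudaFree(flag_dev);
    return 0;
  }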

View File

@@ -20,12 +20,12 @@
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void CalSparseSegmentMeanWithNumSegments(const DataType *x_ptr, const IndexType *indices_ptr,
-const IndexType *segment_ids_ptr,
-const IndexType *num_segments_ptr, size_t *segment_pos_ptr,
-DataType *y_ptr, size_t outer_size, size_t inner_size,
-size_t indices_size, size_t segment_size, size_t x_size,
-size_t y_size, size_t batch_size, int *ret_flag_host,
-uint32_t device_id, cudaStream_t cuda_stream);
+CUDA_LIB_EXPORT int CalSparseSegmentMeanWithNumSegments(const DataType *x_ptr, const IndexType *indices_ptr,
+const IndexType *segment_ids_ptr,
+const IndexType *num_segments_ptr, size_t *segment_pos_ptr,
+DataType *y_ptr, size_t outer_size, size_t inner_size,
+size_t indices_size, size_t segment_size, size_t x_size,
+size_t y_size, size_t batch_size, int *ret_flag_device,
+uint32_t device_id, cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_SEGMENT_MEAN_WITH_NUM_SEGMENTS_IMPL_CUH_

View File

@@ -56,6 +56,7 @@ int SparseSegmentMeanWithNumSegmentsGpuKernelMod::Resize(const BaseOperatorPtr &
 y_size_ = std::accumulate(y_shape.begin() + batch_rank_, y_shape.end(), size_t(1), std::multiplies{});
 segment_size_ = LongToSize(y_shape.at(batch_rank_));
 workspace_size_list_.push_back((segment_size_ + 1) * sizeof(size_t));
+workspace_size_list_.push_back(sizeof(int));
 return ret;
 }
@@ -71,15 +72,16 @@ bool SparseSegmentMeanWithNumSegmentsGpuKernelMod::LaunchKernel(const std::vecto
 auto segment_ids_ptr = GetDeviceAddress<IndexType>(inputs, kIndex2);
 auto num_segments_ptr = GetDeviceAddress<IndexType>(inputs, kIndex3);
 auto segment_pos_ptr = GetDeviceAddress<size_t>(workspace, kIndex0);
+auto ret_flag_device = GetDeviceAddress<int>(workspace, kIndex1);
 auto y_ptr = GetDeviceAddress<DataType>(outputs, kIndex0);
 auto any = [](auto... args) -> bool { return ((args == nullptr) || ...); };
 if (any(x_ptr, indices_ptr, segment_ids_ptr, num_segments_ptr, segment_pos_ptr, y_ptr)) {
-return false;
+cudaMemset(y_ptr, 0, outputs[0]->size);
+return true;
 }
-int ret_flag_host = 0;
-CalSparseSegmentMeanWithNumSegments(x_ptr, indices_ptr, segment_ids_ptr, num_segments_ptr, segment_pos_ptr, y_ptr,
-outer_size_, inner_size_, indices_size_, segment_size_, x_size_, y_size_,
-batch_size_, &ret_flag_host, device_id_, cuda_stream);
+int ret_flag_host = CalSparseSegmentMeanWithNumSegments(
+x_ptr, indices_ptr, segment_ids_ptr, num_segments_ptr, segment_pos_ptr, y_ptr, outer_size_, inner_size_,
+indices_size_, segment_size_, x_size_, y_size_, batch_size_, ret_flag_device, device_id_, cuda_stream);
 int FALSE_1 = 1;
 int FALSE_2 = 2;
 int FALSE_3 = 3;
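The kernel mod above now reserves an extra sizeof(int) workspace slot in Resize and fetches it in LaunchKernel, so the device-side flag no longer has to be allocated inside the CUDA helper. Below is a framework-free sketch of that mechanism under stated assumptions: FakeKernelMod, Allocate, and GetWorkspaceAddress are illustrative stand-ins for the MindSpore kernel-mod lifecycle and GetDeviceAddress, not the real API.

  #include <cuda_runtime.h>
  #include <cstddef>
  #include <vector>

  struct FakeKernelMod {
    std::vector<size_t> workspace_size_list_;
    std::vector<void *> workspace_;

    void Resize(size_t segment_size) {
      workspace_size_list_.clear();
      workspace_size_list_.push_back((segment_size + 1) * sizeof(size_t));  // segment start positions
      workspace_size_list_.push_back(sizeof(int));                          // device-side ret flag
    }

    void Allocate() {  // the real framework provides these buffers between Resize and Launch
      workspace_.resize(workspace_size_list_.size());
      for (size_t i = 0; i < workspace_size_list_.size(); ++i) {
        cudaMalloc(&workspace_[i], workspace_size_list_[i]);
      }
    }

    template <typename T>
    T *GetWorkspaceAddress(size_t index) {  // stand-in for GetDeviceAddress
      return static_cast<T *>(workspace_[index]);
    }

    int Launch(cudaStream_t stream) {
      int *ret_flag_device = GetWorkspaceAddress<int>(1);
      int ret_flag_host = 0;
      // ... pass ret_flag_device to the CUDA helper; here we only round-trip the flag ...
      cudaMemsetAsync(ret_flag_device, 0, sizeof(int), stream);
      cudaMemcpyAsync(&ret_flag_host, ret_flag_device, sizeof(int), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);
      return ret_flag_host;  // 0 means validation passed
    }
  };

  int main() {
    FakeKernelMod mod;
    mod.Resize(/*segment_size=*/8);
    mod.Allocate();
    return mod.Launch(/*stream=*/0);
  }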

View File

@@ -99,7 +99,8 @@ bool SparseSegmentMeanGradGpuKernelMod::LaunchKernel(const std::vector<AddressPt
 size_t *segment_pos_ptr = GetDeviceAddress<size_t>(workspace, kIndex0);
 auto any = [](auto... args) -> bool { return ((args == nullptr) || ...); };
 if (any(grad_ptr, indices_ptr, segment_ids_ptr, segment_pos_ptr, y_ptr)) {
-return false;
+cudaMemset(y_ptr, 0, outputs[0]->size);
+return true;
 }
 cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream_);
 std::vector<S> indices_host;
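Both kernel mods now treat a null device pointer (a zero-size input tensor) as a valid case: the output is zero-filled and the launch reports success instead of failure. A minimal sketch of that path, with illustrative names:

  #include <cuda_runtime.h>

  // grad_dev == nullptr stands for an empty input tensor; an all-zero output is the correct result.
  bool LaunchOrZeroFill(const float *grad_dev, float *y_dev, size_t y_bytes) {
    if (grad_dev == nullptr) {
      cudaMemset(y_dev, 0, y_bytes);
      return true;
    }
    // ... otherwise run the normal validation and kernel-launch path ...
    return true;
  }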

View File

@@ -65,8 +65,8 @@ abstract::ShapePtr SparseSegmentMeanGradInferShape(const PrimitivePtr &prim,
 auto output_dim0_value_ptr_tensor =
 CheckAndConvertUtils::CheckTensorIntValue("output_dim0", output_dim0_value_ptr, prim_name);
 int dim_zero = output_dim0_value_ptr_tensor[kShapeNum0];
-if (dim_zero <= kDimNum0) {
-MS_EXCEPTION(ValueError) << "Input output_dim0 must > 0!";
+if (dim_zero < kDimNum0) {
+MS_EXCEPTION(ValueError) << "Input output_dim0 must >= 0!";
 } else {
 y_shape[kShapeNum0] = dim_zero;
 }