the performance optimization of logsoftmax operator on gpu device. 50+ times better

2023-01-02 23:11:48 +08:00 · 2023-01-02 23:11:48 +08:00 · 777bec3cc1
parent 44cfcca6f4
commit 777bec3cc1
2 changed files with 92 additions and 120 deletions
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softmax_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softmax_impl.cu
@ -36,43 +36,48 @@ constexpr int ALIGN_BYTES = 16;
    }                                                                  \
  }

-inline dim3 SpatialSoftMax_getGridSize(dim3 *block, uint32_t max_active_blocks, uint64_t outer_size, uint64_t dim_size,
-                                       uint64_t inner_size) {
-  // First, tile as many blocks as we can over the y axis
-  uint32_t inner_blocks = (inner_size + block->y - 1) / block->y;
-  if (inner_blocks > max_active_blocks) inner_blocks = max_active_blocks;
-  // Fill the x axis with as many blocks as we can fit (a little more is ok too)
-  uint32_t outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks;
-  if (outer_blocks > outer_size) outer_blocks = outer_size;
-  return dim3(outer_blocks, inner_blocks);
+inline dim3 SpatialSoftMaxGetGridSize(dim3 *block, uint32_t activate_block, uint64_t outer_size, uint64_t dim_size,
+                                      uint64_t inner_size) {
+  uint32_t inner = (inner_size + block->y - 1) / block->y;
+  if (inner > activate_block) {
+    inner = activate_block;
+  }
+  uint32_t outer = (activate_block + inner - 1) / inner;
+  if (outer > outer_size) {
+    outer = outer_size;
+  }
+  return dim3(outer, inner);
 }

-inline dim3 SpatialSoftMax_getBlockSize(uint64_t outer_size, uint64_t dim_size, uint64_t inner_size) {
-  uint32_t inner_threads = inner_size;
-  inner_threads = std::min(inner_threads, static_cast<uint32_t>(max_threads));
+inline dim3 SpatialSoftMaxGetBlockSize(uint64_t outer_size, uint64_t dim_size, uint64_t inner_size) {
+  uint32_t inner_ths = inner_size;
+  inner_ths = std::min(inner_ths, static_cast<uint32_t>(max_threads));
  uint32_t dim_threads = 1;
-  if (inner_threads <= 64 && dim_size >= 64) {
-    while (inner_threads * dim_threads <= max_threads && dim_threads <= dim_size) dim_threads *= 2;
+  if (inner_ths <= 64 && dim_size >= 64) {
+    while ((inner_ths * dim_threads <= max_threads) && (dim_threads <= dim_size)) {
+      dim_threads *= 2;
+    }
    dim_threads /= 2;
  }
-  return dim3(dim_threads, inner_threads);
+  return dim3(dim_threads, inner_ths);
 }

 template <typename accumulate_t, typename Kernel>
-void SpatialSoftMax_getLaunchSizes(Kernel k, uint64_t outer_size, uint64_t dim_size, uint64_t inner_size, dim3 *grid,
-                                   dim3 *block, uint32_t *smem_size, uint32_t device_id) {
-  *block = SpatialSoftMax_getBlockSize(outer_size, dim_size, inner_size);
-  uint32_t block_threads = block->x * block->y;
-  *smem_size = block->x == 1 ? 0 : block_threads * sizeof(accumulate_t);
-
-  int max_active_blocks;
-
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, k, block_threads, *smem_size);
+void SpatialSoftMaxGetLaunchSizes(Kernel k, uint64_t outer_size, uint64_t dim_size, uint64_t inner_size, dim3 *grid,
+                                  dim3 *block, uint32_t *smem_size, uint32_t device_id) {
+  *block = SpatialSoftMaxGetBlockSize(outer_size, dim_size, inner_size);
+  uint32_t block_ths = block->x * block->y;
+  if (block->x == 1) {
+    *smem_size = 0;
+  } else {
+    *smem_size = block_ths * sizeof(accumulate_t);
+  }
+  int activate_size;
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&activate_size, k, block_ths, *smem_size);
  cudaDeviceProp prop;
  (void)cudaGetDeviceProperties(&prop, device_id);
-  max_active_blocks *= prop.multiProcessorCount;
-
-  *grid = SpatialSoftMax_getGridSize(block, max_active_blocks, outer_size, dim_size, inner_size);
+  activate_size *= prop.multiProcessorCount;
+  *grid = SpatialSoftMaxGetGridSize(block, activate_size, outer_size, dim_size, inner_size);
 }

 int log2_ceil(int val) {
@ -135,63 +140,58 @@ __forceinline__ __device__ T SpatialBlockReduceX(T *memsha, T val) {
  return memsha[0];
 }

-template <typename input_t, typename accumulate_t, typename output_t>
+template <typename input_t, typename accumulate_t, typename output_t, bool is_log_softmax>
 __global__ void SpatialSoftMaxForward(output_t *output, input_t *input, uint32_t outer_size, uint32_t dim_size,
                                      uint32_t inner_size) {
  extern __shared__ unsigned char smem[];
  auto sdata = reinterpret_cast<accumulate_t *>(smem);
  const uint32_t outer_stride = inner_size * dim_size;
  const uint32_t dim_stride = inner_size;
-
  for (uint32_t outer_index = blockIdx.x; outer_index < outer_size; outer_index += gridDim.x) {
    const uint32_t outer_offset = outer_index * outer_stride;
    for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < inner_size;
         inner_index += blockDim.y * gridDim.y) {
-      const uint32_t data_offset = outer_offset + inner_index;
-
+      const uint32_t offset = outer_offset + inner_index;
      if (blockDim.x > 1) {
-        accumulate_t max_input = std::numeric_limits<accumulate_t>::lowest();
+        accumulate_t max_data_input = std::numeric_limits<accumulate_t>::lowest();
        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) {
-          const accumulate_t value = static_cast<accumulate_t>(input[data_offset + d * dim_stride]);
-          max_input = atomic::Max()(max_input, value);
+          const accumulate_t value = static_cast<accumulate_t>(input[offset + d * dim_stride]);
+          max_data_input = atomic::Max()(max_data_input, value);
        }
-        max_input = SpatialBlockReduceX<accumulate_t, atomic::Max>(sdata, max_input);
-
+        max_data_input = SpatialBlockReduceX<accumulate_t, atomic::Max>(sdata, max_data_input);
        accumulate_t sum = 0;
        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-          sum += std::exp(static_cast<accumulate_t>(input[data_offset + d * dim_stride]) - max_input);
+          sum += std::exp(static_cast<accumulate_t>(input[offset + d * dim_stride]) - max_data_input);
        sum = SpatialBlockReduceX<accumulate_t, atomic::Add>(sdata, sum);
-
-        SoftMaxForwardEpilogue<input_t, accumulate_t, output_t> epilogue(max_input, sum);
+        SoftMaxForwardEpilogue<input_t, accumulate_t, output_t, is_log_softmax> epilogue(max_data_input, sum);
        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-          output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]);
+          output[offset + d * dim_stride] = epilogue(input[offset + d * dim_stride]);
      } else {
-        accumulate_t max_input = std::numeric_limits<accumulate_t>::lowest();
+        accumulate_t max_data_input = std::numeric_limits<accumulate_t>::lowest();
        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) {
-          const accumulate_t value = static_cast<accumulate_t>(input[data_offset + d * dim_stride]);
-          max_input = atomic::Max()(max_input, value);
+          const accumulate_t value = static_cast<accumulate_t>(input[offset + d * dim_stride]);
+          max_data_input = atomic::Max()(max_data_input, value);
        }
        accumulate_t sum = 0;
        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-          sum += std::exp(static_cast<accumulate_t>(input[data_offset + d * dim_stride]) - max_input);
-        SoftMaxForwardEpilogue<input_t, accumulate_t, output_t> epilogue(max_input, sum);
+          sum += std::exp(static_cast<accumulate_t>(input[offset + d * dim_stride]) - max_data_input);
+
+        SoftMaxForwardEpilogue<input_t, accumulate_t, output_t, is_log_softmax> epilogue(max_data_input, sum);
        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-          output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]);
+          output[offset + d * dim_stride] = epilogue(input[offset + d * dim_stride]);
      }
    }
  }
 }

-template <int InsP, typename input_t, typename accum_t, typename output_t>
+template <int InsP, typename input_t, typename accum_t, typename output_t, bool is_log_softmax>
 __device__ __forceinline__ void WriteResults(int clas, input_t *input, output_t *output, input_t max_k,
                                             input_t sum_all) {
-  SoftMaxForwardEpilogue<input_t, accum_t, output_t> epilogue(max_k, sum_all);
+  SoftMaxForwardEpilogue<input_t, accum_t, output_t, is_log_softmax> epilogue(max_k, sum_all);
  int offset = threadIdx.x;
-
  int last = clas % (InsP * blockDim.x);
  for (; offset < clas - last; offset += blockDim.x * InsP) {
    input_t tmp[InsP];
-
 #pragma unroll
    for (int j = 0; j < InsP; ++j) {
      tmp[j] = input[offset + j * blockDim.x];
@ -201,54 +201,42 @@ __device__ __forceinline__ void WriteResults(int clas, input_t *input, output_t
      output[offset + j * blockDim.x] = epilogue(tmp[j]);
    }
  }
-
  for (; offset < clas; offset += blockDim.x) {
    output[offset] = epilogue(input[offset]);
  }
 }

-template <int InsP, typename input_t, typename accum_t, typename output_t>
-__device__ __forceinline__ void WriteResultsVectorized(int size, const int shift, input_t *input, output_t *output,
+template <int InsP, typename input_t, typename accum_t, typename output_t, bool is_log_softmax>
+__device__ __forceinline__ void WriteResultsVectorized(int size, const int deviate, input_t *input, output_t *output,
                                                       input_t max_k, input_t sum_all) {
-  SoftMaxForwardEpilogue<input_t, accum_t, output_t> epilogue(max_k, sum_all);
-
-  using LoadT = aligned_vector<input_t>;
-  using StoreT = aligned_vector<output_t>;
-
+  SoftMaxForwardEpilogue<input_t, accum_t, output_t, is_log_softmax> epilogue(max_k, sum_all);
+  using loadT = aligned_vector<input_t>;
+  using storeT = aligned_vector<output_t>;
  int offset = threadIdx.x;
-
-  if (shift > 0) {
-    input -= shift;
-    output -= shift;
-    size += shift;
-
-    if (threadIdx.x >= shift) {
+  if (deviate > 0) {
+    input -= deviate;
+    output -= deviate;
+    size += deviate;
+    if (threadIdx.x >= deviate) {
      output[offset] = epilogue(input[offset]);
    }
    size -= blockDim.x;
    input += blockDim.x;
    output += blockDim.x;
  }
-
  const int last = size % (InsP * blockDim.x);
-
  input_t in_v[InsP];
-  LoadT *in_value = reinterpret_cast<LoadT *>(&in_v);
-
+  loadT *in_value = reinterpret_cast<loadT *>(&in_v);
  output_t out_v[InsP];
-  StoreT *out_value = reinterpret_cast<StoreT *>(&out_v);
-
+  storeT *out_value = reinterpret_cast<storeT *>(&out_v);
  for (; offset * InsP < (size - last); offset += blockDim.x) {
-    *in_value = reinterpret_cast<LoadT *>(input)[offset];
-
+    *in_value = reinterpret_cast<loadT *>(input)[offset];
 #pragma unroll
    for (int j = 0; j < InsP; ++j) {
      out_v[j] = epilogue(in_v[j]);
    }
-
-    reinterpret_cast<StoreT *>(output)[offset] = *out_value;
+    reinterpret_cast<storeT *>(output)[offset] = *out_value;
  }
-
  offset = size - last + threadIdx.x;
  for (; offset < size; offset += blockDim.x) {
    output[offset] = epilogue(input[offset]);
@ -256,52 +244,43 @@ __device__ __forceinline__ void WriteResultsVectorized(int size, const int shift
 }

 template <typename Reduction, typename AccT>
-__device__ __forceinline__ AccT ReduceBlock(AccT *smem, AccT val, AccT defaultVal) {
+__device__ __forceinline__ AccT ReduceBlock(AccT *sharemen, AccT val, AccT initVal) {
  Reduction r = Reduction();
-
  __syncthreads();
-
-  smem[threadIdx.x] = val;
-
+  sharemen[threadIdx.x] = val;
  __syncthreads();
-
-  AccT warpVal = defaultVal;
-
+  AccT warpVal = initVal;
  uint32_t mask = (((uint64_t)1) << (blockDim.x / WARPSIZE)) - 1;
  if (threadIdx.x < WARPSIZE) {
    int lane = threadIdx.x % WARPSIZE;
    if (lane < blockDim.x / WARPSIZE) {
 #pragma unroll
      for (int i = 0; i < WARPSIZE; ++i) {
-        warpVal = r(warpVal, smem[lane * WARPSIZE + i]);
+        warpVal = r(warpVal, sharemen[lane * WARPSIZE + i]);
      }
 #ifndef __HIP_PLATFORM_HCC__
      __syncwarp(mask);
 #endif
-      smem[lane] = warpVal;
+      sharemen[lane] = warpVal;
    }
  }
-
  __syncthreads();
-  AccT blockVal = defaultVal;
-
+  AccT blockVal = initVal;
  if (threadIdx.x == 0) {
    for (int i = 0; i < blockDim.x / WARPSIZE; ++i) {
-      blockVal = r(blockVal, smem[i]);
+      blockVal = r(blockVal, sharemen[i]);
    }
-    smem[0] = blockVal;
+    sharemen[0] = blockVal;
  }
-
  __syncthreads();
-  return smem[0];
+  return sharemen[0];
 }

 template <template <typename, typename> class Reduction, int InsP, typename T, typename AccT>
-__device__ __forceinline__ AccT ILPReduce(int shift, T *data, int size, const Reduction<T, AccT> &r, AccT defaultVal) {
-  using LoadT = aligned_vector<T>;
-  AccT threadVal = defaultVal;
+__device__ __forceinline__ AccT ILPReduce(int shift, T *data, int size, const Reduction<T, AccT> &r, AccT initVal) {
+  using loadT = aligned_vector<T>;
+  AccT threadVal = initVal;
  int offset = threadIdx.x;
-
  if (shift > 0) {
    data -= shift;
    size += shift;
@ -313,10 +292,9 @@ __device__ __forceinline__ AccT ILPReduce(int shift, T *data, int size, const Re
  }
  int last = size % (InsP * blockDim.x);
  T v[InsP];
-  LoadT *value = reinterpret_cast<LoadT *>(&v);
+  loadT *value = reinterpret_cast<loadT *>(&v);
  for (; offset * InsP < (size - last); offset += blockDim.x) {
-    *value = reinterpret_cast<LoadT *>(data)[offset];
-
+    *value = reinterpret_cast<loadT *>(data)[offset];
 #pragma unroll
    for (int j = 0; j < InsP; ++j) {
      threadVal = r(threadVal, v[j]);
@ -324,7 +302,6 @@ __device__ __forceinline__ AccT ILPReduce(int shift, T *data, int size, const Re
  }
  offset = size - last + threadIdx.x;
  for (; offset < size; offset += blockDim.x) threadVal = r(threadVal, data[offset]);
-
  return threadVal;
 }

@ -350,7 +327,6 @@ __global__ void SoftMaxWarpForward(output_t *dst, const input_t *src, int batch_
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;

  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
-
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;
  int local_idx = threadIdx.x;
@ -449,7 +425,6 @@ __global__ void SoftMaxWarpForward(output_t *dst, const input_t *src, int batch_
    }
  }
  warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, atomic::Add>(sum);
-
 #pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
@ -472,17 +447,15 @@ __global__ void SoftMaxWarpForward(output_t *dst, const input_t *src, int batch_
  }
 }

-template <int InsP, typename T, typename accumulate_t>
+template <int InsP, typename T, typename accumulate_t, bool is_log_softmax>
 __global__ void cunn_SoftMaxForward(T *output, T *input, int classes) {
  extern __shared__ unsigned char smem[];
  auto sdata = reinterpret_cast<accumulate_t *>(smem);

-  using LoadT = aligned_vector<T>;
-  using StoreT = aligned_vector<T>;
-
+  using loadT = aligned_vector<T>;
+  using storeT = aligned_vector<T>;
  input += blockIdx.x * classes;
  output += blockIdx.x * classes;
-
  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(T);
  const int output_shift = ((uint64_t)output) % ALIGN_BYTES / sizeof(T);

@ -496,14 +469,13 @@ __global__ void cunn_SoftMaxForward(T *output, T *input, int classes) {
  accumulate_t sumAll = ReduceBlock<atomic::Add, accumulate_t>(sdata, threadExp, static_cast<accumulate_t>(0));

  if (shift == output_shift) {
-    WriteResultsVectorized<InsP, T, accumulate_t, T>(classes, shift, input, output, max_k, sumAll);
+    WriteResultsVectorized<InsP, T, accumulate_t, T, is_log_softmax>(classes, shift, input, output, max_k, sumAll);
  } else {
-    WriteResults<InsP, T, accumulate_t, T>(classes, input, output, max_k, sumAll);
+    WriteResults<InsP, T, accumulate_t, T, is_log_softmax>(classes, input, output, max_k, sumAll);
  }
 }

 // end of kernel function
-
 template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax, bool is_masked>
 void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride,
                              int batch_count, cudaStream_t stream, const bool *mask = nullptr, int chunk_size = -1,
@ -567,16 +539,16 @@ void Softmax(T *input_, T *output_, size_t dim_size_, size_t outer_size_, size_t
    } else {
      constexpr int InsP = sizeof(float4) / sizeof(T);
      dim3 block = SoftMaxGetBlockSize(InsP, dim_size_);
-      cunn_SoftMaxForward<InsP, T, accumulate_t>
+      cunn_SoftMaxForward<InsP, T, accumulate_t, is_log_softmax>
        <<<grid, block, block.x * sizeof(T), cuda_stream>>>(output_, input_, dim_size_);
      CUDA_CHECK();
    }
  } else {
    uint32_t smem_size;
    dim3 grid, block;
-    SpatialSoftMax_getLaunchSizes<T>(&SpatialSoftMaxForward<T, accumulate_t, T>, outer_size_, dim_size_, inner_size_,
-                                     &grid, &block, &smem_size, device_id);
-    SpatialSoftMaxForward<T, accumulate_t, T>
+    SpatialSoftMaxGetLaunchSizes<T>(&SpatialSoftMaxForward<T, accumulate_t, T, is_log_softmax>, outer_size_, dim_size_,
+                                    inner_size_, &grid, &block, &smem_size, device_id);
+    SpatialSoftMaxForward<T, accumulate_t, T, is_log_softmax>
      <<<grid, block, smem_size, cuda_stream>>>(output_, input_, outer_size_, dim_size_, inner_size_);
    CUDA_CHECK();
  }
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softmax_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softmax_impl.cuh
@ -37,14 +37,14 @@ struct AccumulateType<double, true> {
 template <typename T, bool is_cuda>
 using acc_type = typename AccumulateType<T, is_cuda>::type;

-template <typename T, typename AccumT, typename OutT>
+template <typename T, typename AccumT, typename OutT, bool is_log_softmax>
 struct SoftMaxForwardEpilogue {
-  __device__ __forceinline__ SoftMaxForwardEpilogue(AccumT max_input, AccumT sum) : max_input(max_input), sum(sum) {}
-
+  __device__ __forceinline__ SoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
+      : max_input(max_input), sum(is_log_softmax == true ? std::log(sum) : sum) {}
  __device__ __forceinline__ OutT operator()(T input) const {
-    return static_cast<OutT>(std::exp((AccumT)input - max_input) / sum);
+    return is_log_softmax == true ? static_cast<OutT>((AccumT)input - max_input - sum)
+                                  : static_cast<OutT>(std::exp((AccumT)input - max_input) / sum);
  }
-
  const AccumT max_input;
  const AccumT sum;
 };