!19894 topk and intopk float16 bug workaround

Merge pull request !19894 from Peilin/topk-float16-cast-r1.3-fix
2021-07-10 07:10:56 +00:00 · 2021-07-10 07:10:56 +00:00 · 070307634b
parent 9c868ca486 c2071ea2c4
commit 070307634b
5 changed files with 75 additions and 25 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/in_top_k_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/in_top_k_gpu_kernel.h
@ -22,6 +22,7 @@

 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/cast_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"

@ -47,12 +48,29 @@ class InTopKGpuKernel : public GpuKernel {
    T *top_k_output_device = GetDeviceAddress<T>(workspace, 0);
    int32_t *top_k_indices_device = GetDeviceAddress<int32_t>(workspace, 1);

-    // topk sorts the input along the last dimension
-    FastTopK(outer_size_, inner_size_, predictions_device, static_cast<int32_t>(k_), top_k_output_device,
-             top_k_indices_device, top_k_init_, reinterpret_cast<cudaStream_t>(stream_ptr));
+    if (std::is_same<T, half>::value) {
+      // remove later! urgent fix for bug: topk has incorrect output for float16
+      float top_k_init = std::numeric_limits<float>::lowest();

-    CalInTopK(predictions_device, targets_device, output_device, top_k_output_device, input_shape_[0], k_,
-              reinterpret_cast<cudaStream_t>(stream_ptr));
+      // cast to float32
+      float *casted_float32_input = GetDeviceAddress<float>(workspace, 2);
+      float *top_k_output_device_float32 = GetDeviceAddress<float>(workspace, 3);
+
+      Cast(input_size_, predictions_device, casted_float32_input, reinterpret_cast<cudaStream_t>(stream_ptr));
+
+      FastTopK(outer_size_, inner_size_, casted_float32_input, static_cast<int32_t>(k_), top_k_output_device_float32,
+               top_k_indices_device, top_k_init, reinterpret_cast<cudaStream_t>(stream_ptr));
+
+      CalInTopK(casted_float32_input, targets_device, output_device, top_k_output_device_float32, input_shape_[0],
+                input_shape_[1], k_, reinterpret_cast<cudaStream_t>(stream_ptr));
+    } else {
+      // topk sorts the input along the last dimension
+      FastTopK(outer_size_, inner_size_, predictions_device, static_cast<int32_t>(k_), top_k_output_device,
+               top_k_indices_device, top_k_init_, reinterpret_cast<cudaStream_t>(stream_ptr));
+
+      CalInTopK(predictions_device, targets_device, output_device, top_k_output_device, input_shape_[0],
+                input_shape_[1], k_, reinterpret_cast<cudaStream_t>(stream_ptr));
+    }

    return true;
  }
@ -114,6 +132,12 @@ class InTopKGpuKernel : public GpuKernel {
    output_size_list_.push_back(input_shape_[0] * sizeof(bool));
    workspace_size_list_.push_back(input_shape_[0] * k_ * sizeof(T));
    workspace_size_list_.push_back(input_shape_[0] * k_ * sizeof(int32_t));
+
+    // remove later! urgent fix for bug: topk has incorrect output for float16
+    if (std::is_same<T, half>::value) {
+      workspace_size_list_.push_back(input_size_ * sizeof(float));
+      workspace_size_list_.push_back(input_shape_[0] * k_ * sizeof(float));
+    }
  }

 private:
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h
@ -21,6 +21,7 @@
 #include <vector>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/cast_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"

 namespace mindspore {
@ -42,20 +43,38 @@ class TopKGpuKernel : public GpuKernel {
    T *output_addr = GetDeviceAddress<T>(outputs, 0);
    S *indices = GetDeviceAddress<S>(outputs, 1);

-    T init_k = std::numeric_limits<T>::lowest();
-    if (std::is_same<T, half>::value) {
-      // min value representable by float16, std::numeric_limits doesn't support half
-      init_k = static_cast<half>(-65504.);
-    }
-
    S k_cut = 0;
    CHECK_CUDA_RET_WITH_EXCEPT(
      kernel_node_,
      cudaMemcpyAsync(&k_cut, k, sizeof(S), cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream_ptr)),
      "cudaMemcpyAsync k_cut failed");
    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed - TopK");
-    FastTopK(outer_size_, inner_size_, input_addr, k_cut, output_addr, indices, init_k,
-             reinterpret_cast<cudaStream_t>(stream_ptr));
+
+    if (std::is_same<T, half>::value) {
+      // remove later! urgent fix for bug: topk has incorrect output for float16
+      float init_k = std::numeric_limits<float>::lowest();
+
+      // cast to float32
+      float *casted_float32_input = GetDeviceAddress<float>(workspaces, 0);
+      float *casted_float32_top_k_output = GetDeviceAddress<float>(workspaces, 1);
+      Cast(outer_size_ * inner_size_, input_addr, casted_float32_input, reinterpret_cast<cudaStream_t>(stream_ptr));
+
+      // call FastTopK with workspace[n], workspace[n+1] as input, output
+      FastTopK(outer_size_, inner_size_, casted_float32_input, k_cut, casted_float32_top_k_output, indices, init_k,
+               reinterpret_cast<cudaStream_t>(stream_ptr));
+
+      // cast workspace[n+1] back to float16
+      Cast(outer_size_ * k_, casted_float32_top_k_output, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
+    } else {
+      T init_k = std::numeric_limits<T>::lowest();
+      CHECK_CUDA_RET_WITH_EXCEPT(
+        kernel_node_,
+        cudaMemcpyAsync(&k_cut, k, sizeof(S), cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream_ptr)),
+        "cudaMemcpyAsync k_cut failed");
+      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed - TopK");
+      FastTopK(outer_size_, inner_size_, input_addr, k_cut, output_addr, indices, init_k,
+               reinterpret_cast<cudaStream_t>(stream_ptr));
+    }
    return true;
  }

@ -82,6 +101,12 @@ class TopKGpuKernel : public GpuKernel {
    input_size_list_.push_back(sizeof(S));
    output_size_list_.push_back(outer_size_ * k_ * sizeof(T));
    output_size_list_.push_back(outer_size_ * k_ * sizeof(S));
+
+    // remove later! urgent fix for bug: topk has incorrect output for float16
+    if (std::is_same<T, half>::value) {
+      workspace_size_list_.push_back(outer_size_ * inner_size_ * sizeof(float));
+      workspace_size_list_.push_back(outer_size_ * k_ * sizeof(float));
+    }
  }

 private:
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cu
@ -21,9 +21,9 @@

 template <typename T>
 __global__ void InTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output,
-                       size_t class_id_count, int64_t k) {
+                       size_t batch_size, size_t class_id_count, int64_t k) {
  size_t gt_id = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; gt_id < class_id_count; gt_id += blockDim.x * gridDim.x) {
+  for (; gt_id < batch_size; gt_id += blockDim.x * gridDim.x) {
    int32_t target_index = targets[gt_id];
    T predicted_value = predictions[gt_id * class_id_count + target_index];
    T top_k_smallest_value = top_k_output[k - 1];
@ -33,14 +33,15 @@ __global__ void InTopK(const T *predictions, const int32_t *targets, bool *outpu
 }

 template <typename T>
-void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t class_id_count,
-               int64_t k, cudaStream_t cuda_stream) {
+void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t batch_size,
+               size_t class_id_count, int64_t k, cudaStream_t cuda_stream) {
  InTopK<<<GET_BLOCKS(class_id_count), GET_THREADS, 0, cuda_stream>>>(predictions, targets, output, top_k_output,
-                                                                      class_id_count, k);
+                                                                      batch_size, class_id_count, k);
 }

 template void CalInTopK<half>(const half *predictions, const int32_t *targets, bool *output, const half *top_k_output,
-                              size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
+                              size_t batch_size, size_t class_id_count, int64_t k, cudaStream_t cuda_stream);

 template void CalInTopK<float>(const float *predictions, const int32_t *targets, bool *output,
-                               const float *top_k_output, size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
+                               const float *top_k_output, size_t batch_size, size_t class_id_count, int64_t k,
+                               cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cuh
@ -20,7 +20,7 @@
 #include <cuda_runtime.h>

 template <typename T>
-void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t class_id_count,
-               int64_t k, cudaStream_t cuda_stream);
+void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t batch_size,
+               size_t class_id_count, int64_t k, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_IN_TOP_K_CUH_
--- a/tests/st/ops/gpu/test_in_top_k.py
+++ b/tests/st/ops/gpu/test_in_top_k.py
@ -32,9 +32,9 @@ class InTopKNet(nn.Cell):
 def in_top_k(nptype):
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

-    predictions = Tensor(np.array([[9, 3, 8],
-                                   [7, 9, 9],
-                                   [9, 9, 9]]).astype(nptype))
+    predictions = Tensor(np.array([[9, 3, 8, 0, 0, 0, 0, 0, 0],
+                                   [7, 9, 9, 0, 0, 0, 0, 0, 0],
+                                   [9, 9, 9, 0, 0, 0, 0, 0, 0]]).astype(nptype))

    k = 1
    in_top_k_net = InTopKNet(k)