baihuawei 2020-10-24 16:17:53 +08:00
parent c5867d2d7e
commit 651d26b1da
6 changed files with 78 additions and 28 deletions

View File

@@ -18,11 +18,18 @@
 #include "loss_with_reduction_impl.cuh"
 #include "runtime/device/gpu/cuda_common.h"
+inline __device__ float logT(float x) { return logf(x); }
+inline __device__ half logT(half x) { return hlog(x); }
+inline __device__ float castT(float ref, int x) { return __int2float_rd(x); }
+inline __device__ half castT(half ref, int x) { return __int2half_rd(x); }
+inline __device__ float maxT(float a, float b) { return fmaxf(a, b); }
+inline __device__ half maxT(half a, half b) { return a > b ? a : b; }
 template <typename T>
 __global__ void Copy(T *loss, T *tmp_loss, int reduction, int input_size) {
   loss[0] += tmp_loss[0];
   if (reduction == 1) {
-    loss[0] /= input_size;
+    loss[0] /= castT(loss[0], input_size);
   }
 }
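The six overloads above are what let the templated kernels in this file compile for both float and half: logf and fmaxf are float-only, hlog and __int2half_rd are the half intrinsics, and `loss[0] /= input_size` relied on an int-to-float conversion that the half type does not provide, hence the castT detour. The otherwise unused ref parameter of castT exists only so overload resolution can pick the destination type. A minimal sketch of the same dispatch pattern (a standalone illustration, not part of the commit; assumes cuda_fp16.h and sm_53+ for native half arithmetic):

#include <cuda_fp16.h>

// Invented stand-ins mirroring the committed helpers.
inline __device__ float divByCount(float v, int n) { return v / __int2float_rd(n); }
inline __device__ half divByCount(half v, int n) { return v / __int2half_rd(n); }

// One template body, two element types: overload resolution routes each
// instantiation to its own intrinsic at compile time.
template <typename T>
__global__ void MeanOfSum(T *acc, int n) {
  acc[0] = divByCount(acc[0], n);  // acc[0] /= n; may not compile for half
}

template __global__ void MeanOfSum<float>(float *, int);
template __global__ void MeanOfSum<half>(half *, int);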
@@ -48,14 +55,14 @@ __global__ void KLDivLossKernel(const int input_size, const int reduction, const
   T epsilon = 1e-6;
   if (reduction == 0) {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < input_size; i += blockDim.x * gridDim.x) {
-      T denominator = max(input_y[i], epsilon);
-      T value = input_y[i] * (logf(denominator) - input_x[i]);
+      T denominator = maxT(input_y[i], epsilon);
+      T value = input_y[i] * (logT(denominator) - input_x[i]);
       loss[i] = value;
     }
   } else {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < input_size; i += blockDim.x * gridDim.x) {
-      T denominator = max(input_y[i], epsilon);
-      T value = input_y[i] * (logf(denominator) - input_x[i]);
+      T denominator = maxT(input_y[i], epsilon);
+      T value = input_y[i] * (logT(denominator) - input_x[i]);
       tmp_loss[i] = value;
     }
   }
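For reference, the per-element quantity both branches compute (x is the log-space prediction, y the target, and ε = 1e-6 the clamp in front of the logarithm) is

\ell_i = y_i \, (\log \max(y_i, \varepsilon) - x_i)

The reduction codes can be read off the control flow: 0 stores ℓ_i elementwise, anything else accumulates into tmp_loss, and the Copy kernel above divides the accumulated sum by input_size when reduction == 1 (mean) and leaves it as a plain sum otherwise.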
@@ -85,21 +92,22 @@ template <typename T>
 __global__ void KLDivLossGradKernel(const int input_size, const int reduction, const T *input_x, const T *input_y,
                                     const T *dloss, T *dx, T *dy) {
   T epsilon = 1e-6;
+  T one = static_cast<T>(1);
   if (reduction == 0) {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < input_size; i += blockDim.x * gridDim.x) {
-      T denominator = max(input_y[i], epsilon);
+      T denominator = maxT(input_y[i], epsilon);
       dx[i] = -input_y[i] * dloss[i];
-      dy[i] = (logf(denominator) + 1 - input_x[i]) * dloss[i];
+      dy[i] = (logT(denominator) + one - input_x[i]) * dloss[i];
     }
   } else {
     T dloss1 = dloss[0];
     if (reduction == 1) {
-      dloss1 = dloss[0] / input_size;
+      dloss1 = dloss[0] / castT(dloss[0], input_size);
     }
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < input_size; i += blockDim.x * gridDim.x) {
-      T denominator = max(input_y[i], epsilon);
+      T denominator = maxT(input_y[i], epsilon);
       dx[i] = -input_y[i] * dloss1;
-      dy[i] = (logf(denominator) + 1 - input_x[i]) * dloss1;
+      dy[i] = (logT(denominator) + one - input_x[i]) * dloss1;
     }
   }
 }
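Differentiating that elementwise term gives exactly the two expressions the kernel writes,

\partial \ell_i / \partial x_i = -y_i, \qquad \partial \ell_i / \partial y_i = \log \max(y_i, \varepsilon) + 1 - x_i

each scaled by the incoming dloss; under mean reduction the scalar dloss is first divided by input_size via castT, matching the 1/N applied in the forward mean.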
@@ -115,16 +123,17 @@ template <typename T>
 __global__ void BinaryCrossEntropyLossKernel(const int input_size, const int reduction, const T *input_x,
                                              const T *input_y, const T *weight, T *loss, T *tmp_loss) {
   T epsilon = 1e-12;
+  T one = static_cast<T>(1);
   if (reduction == 0) {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < input_size; i += blockDim.x * gridDim.x) {
       T value =
-        -weight[i] * (input_y[i] * logf(input_x[i] + epsilon) + (1 - input_y[i]) * logf(1 - input_x[i] + epsilon));
+        -weight[i] * (input_y[i] * logT(input_x[i] + epsilon) + (one - input_y[i]) * logT(one - input_x[i] + epsilon));
       loss[i] = value;
     }
   } else {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < input_size; i += blockDim.x * gridDim.x) {
       T value =
-        -weight[i] * (input_y[i] * logf(input_x[i] + epsilon) + (1 - input_y[i]) * logf(1 - input_x[i] + epsilon));
+        -weight[i] * (input_y[i] * logT(input_x[i] + epsilon) + (one - input_y[i]) * logT(one - input_x[i] + epsilon));
       tmp_loss[i] = value;
     }
   }
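The term being assembled is the weighted binary cross-entropy

\ell_i = -w_i \, [\, y_i \log(x_i + \varepsilon) + (1 - y_i) \log(1 - x_i + \varepsilon) \,]

with ε guarding both logarithms against log(0). One caveat worth noting: 1e-12 is far below the smallest half subnormal (about 6e-8), so for T = half the epsilon rounds to zero and the guard is effectively a no-op.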
@@ -154,19 +163,20 @@ template <typename T>
 __global__ void BinaryCrossEntropyLossGradKernel(const int input_size, const int reduction, const T *input_x,
                                                  const T *input_y, const T *weight, const T *dloss, T *dx) {
   T epsilon = 1e-12;
+  T one = static_cast<T>(1);
   if (reduction == 0) {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < input_size; i += blockDim.x * gridDim.x) {
-      T denominator = max(input_x[i] * (1 - input_x[i]), epsilon);
+      T denominator = maxT(input_x[i] * (one - input_x[i]), epsilon);
       T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
       dx[i] = value * dloss[i];
     }
   } else {
     T dloss1 = dloss[0];
     if (reduction == 1) {
-      dloss1 = dloss[0] / input_size;
+      dloss1 = dloss[0] / castT(dloss[0], input_size);
     }
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < input_size; i += blockDim.x * gridDim.x) {
-      T denominator = max(input_x[i] * (1 - input_x[i]), epsilon);
+      T denominator = maxT(input_x[i] * (one - input_x[i]), epsilon);
       T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
       dx[i] = value * dloss1;
     }
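Differentiating ℓ_i with respect to the prediction x_i (ignoring the ε inside the logarithms) recovers the kernel's expression:

\partial \ell_i / \partial x_i = -w_i \left( \frac{y_i}{x_i} - \frac{1 - y_i}{1 - x_i} \right) = \frac{w_i (x_i - y_i)}{x_i (1 - x_i)}

with the denominator clamped to ε via maxT so the division stays finite as x_i approaches 0 or 1.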
@@ -180,16 +190,30 @@ void BinaryCrossEntropyLossGrad(const int &input_size, const int &reduction, con
                                                                          input_y, weight, dloss, dx);
 }
-template void KLDivLoss(const int &input_size, const int &reduction, const float *input_x, const float *input_y,
-                        float *loss, float *tmp_loss, cudaStream_t stream);
+template void KLDivLoss<float>(const int &input_size, const int &reduction, const float *input_x, const float *input_y,
+                               float *loss, float *tmp_loss, cudaStream_t stream);
-template void KLDivLossGrad(const int &input_size, const int &reduction, const float *input_x, const float *input_y,
-                            const float *dloss, float *dx, float *dy, cudaStream_t stream);
+template void KLDivLossGrad<float>(const int &input_size, const int &reduction, const float *input_x,
+                                   const float *input_y, const float *dloss, float *dx, float *dy, cudaStream_t stream);
-template void BinaryCrossEntropyLoss(const int &input_size, const int &reduction, const float *input_x,
-                                     const float *input_y, const float *weight, float *loss, float *tmp_loss,
-                                     cudaStream_t stream);
+template void BinaryCrossEntropyLoss<float>(const int &input_size, const int &reduction, const float *input_x,
+                                            const float *input_y, const float *weight, float *loss, float *tmp_loss,
+                                            cudaStream_t stream);
-template void BinaryCrossEntropyLossGrad(const int &input_size, const int &reduction, const float *input_x,
-                                         const float *input_y, const float *weight, const float *dloss, float *dx,
-                                         cudaStream_t stream);
+template void BinaryCrossEntropyLossGrad<float>(const int &input_size, const int &reduction, const float *input_x,
+                                                const float *input_y, const float *weight, const float *dloss,
+                                                float *dx, cudaStream_t stream);
+template void KLDivLoss<half>(const int &input_size, const int &reduction, const half *input_x, const half *input_y,
+                              half *loss, half *tmp_loss, cudaStream_t stream);
+template void KLDivLossGrad<half>(const int &input_size, const int &reduction, const half *input_x, const half *input_y,
+                                  const half *dloss, half *dx, half *dy, cudaStream_t stream);
+template void BinaryCrossEntropyLoss<half>(const int &input_size, const int &reduction, const half *input_x,
+                                           const half *input_y, const half *weight, half *loss, half *tmp_loss,
+                                           cudaStream_t stream);
+template void BinaryCrossEntropyLossGrad<half>(const int &input_size, const int &reduction, const half *input_x,
+                                               const half *input_y, const half *weight, const half *dloss, half *dx,
+                                               cudaStream_t stream);
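With both instantiations emitted from this translation unit, a host-side call is type-agnostic. A hedged usage sketch: it assumes loss_with_reduction_impl.cuh declares the wrappers with the signatures instantiated above, that reduction follows the 0 = none / 1 = mean / otherwise sum convention visible in the kernels, and that the caller zero-initializes loss because the reduction path accumulates into it:

#include <cuda_fp16.h>
#include "loss_with_reduction_impl.cuh"

// x, y: n-element device buffers; tmp_loss: n-element device scratch;
// loss: 1-element device buffer, zeroed beforehand (Copy does loss[0] += ...).
void ComputeKLDivMeanFp16(const half *x, const half *y, half *loss, half *tmp_loss,
                          int n, cudaStream_t stream) {
  const int reduction = 1;  // assumed: mean
  KLDivLoss<half>(n, reduction, x, y, loss, tmp_loss, stream);
}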

View File

@@ -24,5 +24,12 @@ MS_REG_GPU_KERNEL_ONE(BinaryCrossEntropy,
                         .AddInputAttr(kNumberTypeFloat32)
                         .AddOutputAttr(kNumberTypeFloat32),
                       BinaryCrossEntropyGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(BinaryCrossEntropy,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16),
+                      BinaryCrossEntropyGpuKernel, half)
 }  // namespace kernel
 }  // namespace mindspore
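Each MS_REG_GPU_KERNEL_ONE invocation binds one KernelAttr signature and one element type to the templated kernel class, which is how the same BinaryCrossEntropyGpuKernel now serves float32 and float16 graphs. As an illustrative miniature only (all names invented; the real MindSpore macro and kernel factory are not shown in this diff), one-type registration macros of this kind typically expand to a self-registering static object:

#include <functional>
#include <map>
#include <string>

using KernelCtor = std::function<void *()>;

inline std::map<std::string, KernelCtor> &KernelTable() {
  static std::map<std::string, KernelCtor> table;  // "op/dtype" -> constructor
  return table;
}

// Registration happens as a side effect of static initialization, so adding a
// half kernel is exactly one more macro line in the .cc file, as above.
#define REG_KERNEL_ONE(OP, DTYPE, CLASS, T)  \
  static const bool g_##OP##_##DTYPE##_reg = \
      (KernelTable()[#OP "/" #DTYPE] = [] { return static_cast<void *>(new CLASS<T>); }, true);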

View File

@@ -26,5 +26,13 @@ MS_REG_GPU_KERNEL_ONE(BinaryCrossEntropyGrad,
                         .AddInputAttr(kNumberTypeFloat32)
                         .AddOutputAttr(kNumberTypeFloat32),
                       BinaryCrossEntropyGradGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(BinaryCrossEntropyGrad,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16),
+                      BinaryCrossEntropyGradGpuKernel, half)
 }  // namespace kernel
 }  // namespace mindspore

View File

@@ -65,16 +65,15 @@ class BinaryCrossEntropyGradGpuKernel : public GpuKernel {
 protected:
   void InitSizeLists() override {
     input_size_list_.push_back(input_size_ * sizeof(T));
     input_size_list_.push_back(input_size_ * sizeof(T));
-    input_size_list_.push_back(input_size_ * sizeof(T));
     if (reduction_ == 0) {
-      output_size_list_.push_back(input_size_ * sizeof(T));
+      input_size_list_.push_back(input_size_ * sizeof(T));
     } else {
-      output_size_list_.push_back(sizeof(T));
+      input_size_list_.push_back(sizeof(T));
     }
     input_size_list_.push_back(input_size_ * sizeof(T));
+    output_size_list_.push_back(input_size_ * sizeof(T));
   }
 private:
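Read together with the four-input Float16 registration above, the size lists now match the kernel's contract: dloss shrinks to a single element whenever the forward pass reduced the loss to a scalar, while dx keeps the full tensor size regardless of reduction. Summarized (the x, y, dloss, weight input order is an assumption inferred from the branch position in InitSizeLists):

// N = input_size_ elements of sizeof(T) each
// reduction == 0 (none):     inputs x:N, y:N, dloss:N, weight:N  -> output dx:N
// reduction != 0 (mean/sum): inputs x:N, y:N, dloss:1, weight:N  -> output dx:N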

View File

@@ -22,5 +22,9 @@ MS_REG_GPU_KERNEL_ONE(
   KLDivLoss,
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
   KLDivLossGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(
+  KLDivLoss,
+  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  KLDivLossGpuKernel, half)
 }  // namespace kernel
 }  // namespace mindspore

View File

@@ -26,5 +26,13 @@ MS_REG_GPU_KERNEL_ONE(KLDivLossGrad,
                         .AddOutputAttr(kNumberTypeFloat32)
                         .AddOutputAttr(kNumberTypeFloat32),
                       KLDivLossGradGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(KLDivLossGrad,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16),
+                      KLDivLossGradGpuKernel, half)
 }  // namespace kernel
 }  // namespace mindspore