From e1b31c7baa708b684e2954b76f33dcd4e055e8c0 Mon Sep 17 00:00:00 2001
From: baihuawei
Date: Tue, 22 Sep 2020 08:24:17 +0800
Subject: [PATCH] add GPU CTCLoss

---
 .../gpu/cuda_impl/ctcloss_impl.cu             | 446 ++++++++++++++++++
 .../gpu/cuda_impl/ctcloss_impl.cuh            |  51 ++
 .../gpu/nn/ctcloss_gpu_kernel.cc              |  62 +--
 .../gpu/nn/ctcloss_gpu_kernel.h               | 425 +++++++++--------
 .../gpu/nn/ctclossv2_gpu_kernel.cc            |  31 ++
 .../gpu/nn/ctclossv2_gpu_kernel.h             | 192 ++++++++
 6 files changed, 984 insertions(+), 223 deletions(-)
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ctcloss_impl.cu
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ctcloss_impl.cuh
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctclossv2_gpu_kernel.cc
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctclossv2_gpu_kernel.h

diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ctcloss_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ctcloss_impl.cu
new file mode 100644
index 0000000000..2de65bbf85
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ctcloss_impl.cu
@@ -0,0 +1,446 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits>
+#include "ctcloss_impl.cuh"
+#include "runtime/device/gpu/cuda_common.h"
+template <typename T>
+__device__ T LogSumExp(const T logprob1, const T logprob2) {
+  if (logprob1 == logprob2 && logprob1 == -std::numeric_limits<T>::infinity()) {
+    return logprob1;
+  } else {
+    return (logprob1 > logprob2) ? logprob1 + log1pf(expf(logprob2 - logprob1))
+                                 : logprob2 + log1pf(expf(logprob1 - logprob2));
+  }
+}
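+// LogSumExp computes log(exp(logprob1) + exp(logprob2)) without leaving log space:
+// log(e^a + e^b) = max(a, b) + log1p(exp(-|a - b|)). Factoring out the larger operand
+// keeps the exp() argument non-positive, so the sum neither overflows nor loses all
+// precision when both inputs are large negative log probabilities. The early return
+// handles the log(0) + log(0) case, where the subtraction would otherwise produce NaN.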
+template <typename T>
+__global__ void CalculateFwdVarKernel(T *log_alpha_b, int *label_value_with_blank, T *softmax_probs,
+                                      const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet,
+                                      int maxtime, int blank, int *label_squence_length, int *cum_labels_length,
+                                      bool ignore_longer_outputs_than_inputs) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch; i += blockDim.x * gridDim.x) {
+    // Skip empty sequences, and over-long labels when they are configured to be ignored.
+    if (sequence_length[i] == 0 ||
+        (ignore_longer_outputs_than_inputs && label_squence_length[i] > sequence_length[i])) {
+      continue;
+    }
+    T *log_alpha_b_cur = &log_alpha_b[i * SOffSet * maxtime];
+    int *label_value_with_blank_cur = &label_value_with_blank[0];
+    if (i > 0) {
+      label_value_with_blank_cur = &label_value_with_blank[2 * cum_labels_length[i - 1] + i];
+    }
+    int numclass = blank + 1;
+    int U = 2 * label_squence_length[i] + 1;
+    int Ti = sequence_length[i];
+    int low = 0;
+    int high = 0;
+    // Initialization: at t = 0 only the leading blank and the first label can be active.
+    log_alpha_b_cur[0] = log(softmax_probs[i * numclass + blank]);
+    int label0 = blank;
+    if (U > 1) {
+      label0 = label_value_with_blank_cur[1];
+      log_alpha_b_cur[maxtime] = log(softmax_probs[i * numclass + label0]);
+    }
+    for (int t = 1; t < Ti; ++t) {
+      // Restrict u to the band of states that can still reach the end (low) and that
+      // are reachable from the start (high); everything outside stays at log(0).
+      low = 0;
+      high = U;
+      int low_limit = U - (2 * (Ti - t));
+      int high_limit = 2 * (t + 1);
+      if (low_limit > low) {
+        low = low_limit;
+      }
+      if (high_limit < U) {
+        high = high_limit;
+      }
+      for (int u = low; u < high; ++u) {
+        T sum_log_alpha = -std::numeric_limits<T>::infinity();
+        if (ctc_merge_repeated || label_value_with_blank_cur[u] == blank) {
+          sum_log_alpha = log_alpha_b_cur[u * maxtime + t - 1];
+        }
+        if (u > 0) {
+          sum_log_alpha = LogSumExp(sum_log_alpha, log_alpha_b_cur[(u - 1) * maxtime + t - 1]);
+        }
+        if (u > 1) {
+          const bool matching_labels_merge =
+            ctc_merge_repeated && (label_value_with_blank_cur[u] == label_value_with_blank_cur[u - 2]);
+          if (label_value_with_blank_cur[u] != blank && !matching_labels_merge) {
+            sum_log_alpha = LogSumExp(sum_log_alpha, log_alpha_b_cur[(u - 2) * maxtime + t - 1]);
+          }
+        }
+        log_alpha_b_cur[u * maxtime + t] =
+          log(softmax_probs[i * numclass + label_value_with_blank_cur[u] + t * numclass * batch]) + sum_log_alpha;
+      }
+    }
+  }
+}
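+// The backward pass mirrors the recursion above. With l' the blank-augmented label of
+// length U = 2 * |l| + 1, beta(u, t) accumulates the probability of completing l' from
+// state u at time t. It starts at 0 (= log 1) on the last frame for the final label and
+// trailing blank, and each step sums the stay, advance-by-one, and advance-by-two
+// transitions, each weighted by the emission probability at t + 1.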
+template <typename T>
+__global__ void CalculateBwdVarKernel(T *log_beta_b, int *label_value_with_blank, T *softmax_probs,
+                                      const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet,
+                                      int maxtime, int blank, int *label_squence_length, int *cum_labels_length,
+                                      bool ignore_longer_outputs_than_inputs) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch; i += blockDim.x * gridDim.x) {
+    if (sequence_length[i] == 0 ||
+        (ignore_longer_outputs_than_inputs && label_squence_length[i] > sequence_length[i])) {
+      continue;
+    }
+    T *log_beta_b_cur = &log_beta_b[i * SOffSet * maxtime];
+    int *label_value_with_blank_cur = &label_value_with_blank[0];
+    if (i > 0) {
+      label_value_with_blank_cur = &label_value_with_blank[2 * cum_labels_length[i - 1] + i];
+    }
+    int numclass = blank + 1;
+    int U = 2 * label_squence_length[i] + 1;
+    int Ti = sequence_length[i];
+    int low = 0;
+    int high = 0;
+    // At the last frame the path must sit on the final label or the trailing blank.
+    if (U > 1) {
+      for (int u = U - 2; u < U; ++u) {
+        log_beta_b_cur[u * maxtime + Ti - 1] = 0;
+      }
+    } else {
+      // Only the single blank state exists when U == 1.
+      log_beta_b_cur[Ti - 1] = 0;
+    }
+    for (int t = Ti - 2; t >= 0; --t) {
+      low = 0;
+      high = U;
+      int low_limit = U - (2 * (Ti - t));
+      int high_limit = 2 * (t + 1);
+      if (low_limit > low) {
+        low = low_limit;
+      }
+      if (high_limit < U) {
+        high = high_limit;
+      }
+      for (int u = low; u < high; ++u) {
+        if (ctc_merge_repeated || label_value_with_blank_cur[u] == blank) {
+          log_beta_b_cur[u * maxtime + t] = LogSumExp(
+            log_beta_b_cur[u * maxtime + t],
+            log_beta_b_cur[u * maxtime + t + 1] +
+              log(softmax_probs[i * numclass + label_value_with_blank_cur[u] + (t + 1) * numclass * batch]));
+        }
+        if (u + 1 < U) {
+          log_beta_b_cur[u * maxtime + t] = LogSumExp(
+            log_beta_b_cur[u * maxtime + t],
+            log_beta_b_cur[(u + 1) * maxtime + t + 1] +
+              log(softmax_probs[i * numclass + label_value_with_blank_cur[u + 1] + (t + 1) * numclass * batch]));
+        }
+        if (u + 2 < U) {
+          const bool matching_labels_merge =
+            ctc_merge_repeated && (label_value_with_blank_cur[u] == label_value_with_blank_cur[u + 2]);
+          if (label_value_with_blank_cur[u] != blank && !matching_labels_merge) {
+            log_beta_b_cur[u * maxtime + t] = LogSumExp(
+              log_beta_b_cur[u * maxtime + t],
+              log_beta_b_cur[(u + 2) * maxtime + t + 1] +
+                log(softmax_probs[i * numclass + label_value_with_blank_cur[u + 2] + (t + 1) * numclass * batch]));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void ProbInitKernel(T *prob_num, int size) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+    prob_num[i] = -std::numeric_limits<T>::infinity();
+  }
+}
+template <typename T>
+__global__ void LogBInitKernel(T *log_b, int log_prob_size) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < log_prob_size; i += blockDim.x * gridDim.x) {
+    log_b[i] = -std::numeric_limits<T>::infinity();
+  }
+}
+
+template <typename T>
+__global__ void CTCLossKernel(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_with_blank, int batch,
+                              int SOffSet, int maxtime, int numclass, const int *sequence_length,
+                              int *label_squence_length, int *cum_labels_length, T *cost, T *grads, T *prob_num,
+                              bool ignore_longer_outputs_than_inputs) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch; i += blockDim.x * gridDim.x) {
+    if (sequence_length[i] == 0 ||
+        (ignore_longer_outputs_than_inputs && label_squence_length[i] > sequence_length[i])) {
+      continue;
+    }
+    T *grad_cur = &grads[i * numclass];
+    const T *softmax_probs_cur = &softmax_probs[i * numclass];
+    T *prob_num_cur = &prob_num[i * numclass];
+    int U = 2 * label_squence_length[i] + 1;
+    T log_pzx = -std::numeric_limits<T>::infinity();
+    const T *log_alpha_b_cur = &log_alpha_b[i * SOffSet * maxtime];
+    const T *log_beta_b_cur = &log_beta_b[i * SOffSet * maxtime];
+    int *label_value_with_blank_cur = &label_value_with_blank[0];
+    if (i > 0) {
+      label_value_with_blank_cur = &label_value_with_blank[2 * cum_labels_length[i - 1] + i];
+    }
+    // log p(z|x) = logsumexp_u [alpha(u, 0) + beta(u, 0)]; any time column would do.
+    for (int u = 0; u < U; ++u) {
+      log_pzx = LogSumExp(log_pzx, log_alpha_b_cur[u * maxtime] + log_beta_b_cur[u * maxtime]);
+    }
+    cost[i] = -log_pzx;
+    // grad
+    int L = numclass;
+    int Ti = sequence_length[i];
+    if (log_pzx == -std::numeric_limits<T>::infinity()) {
+      // The target is unreachable: fall back to the softmax itself as the gradient.
+      for (int t = 0; t < Ti; ++t) {
+        for (int l = 0; l < L; ++l) {
+          grad_cur[t * numclass * batch + l] = softmax_probs_cur[t * numclass * batch + l];
+        }
+      }
+    } else {
+      for (int t = 0; t < Ti; ++t) {
+        for (int u = 0; u < U; ++u) {
+          int l = label_value_with_blank_cur[u];
+          prob_num_cur[t * batch * numclass + l] =
+            LogSumExp(prob_num_cur[t * batch * numclass + l],
+                      log_alpha_b_cur[u * maxtime + t] + log_beta_b_cur[u * maxtime + t]);
+        }
+        // dL/dy(l, t) = softmax(l, t) - sum_{u : l'_u = l} exp(alpha(u, t) + beta(u, t) - log p(z|x))
+        for (int l = 0; l < L; ++l) {
+          grad_cur[t * numclass * batch + l] =
+            softmax_probs_cur[t * numclass * batch + l] - expf(prob_num_cur[t * batch * numclass + l] - log_pzx);
+        }
+      }
+    }
+  }
+}
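+// The incoming probs are raw network activations, so a softmax is applied per
+// (time, batch) slice before the loss. Subtracting the per-slice maximum first is the
+// usual trick to keep exp() from overflowing; the shift cancels in the normalization.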
+template <typename T>
+__global__ void InnerSoftMaxKernel(const T *probs, T *softmax_probs, const int *sequence_length, int max_time,
+                                   int batch, int numclass) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch * max_time; i += blockDim.x * gridDim.x) {
+    int k = i / batch;
+    int m = i % batch;
+    if (k < sequence_length[m]) {
+      T maxCoeff = 0.;
+      T sumCoeff = 0.;
+      for (int j = i * numclass; j < (i + 1) * numclass; ++j) {
+        if (probs[j] > maxCoeff) {
+          maxCoeff = probs[j];
+        }
+      }
+      for (int j = i * numclass; j < (i + 1) * numclass; ++j) {
+        sumCoeff += exp(probs[j] - maxCoeff);
+        softmax_probs[j] = exp(probs[j] - maxCoeff);
+      }
+      for (int j = i * numclass; j < (i + 1) * numclass; ++j) {
+        softmax_probs[j] /= sumCoeff;
+      }
+    }
+  }
+}
+
+// Collapse consecutive repeated labels (preprocess_collapse_repeated) and update the
+// per-batch label lengths in place.
+__global__ void GenLabelValuePCRKernel(int *label_value_sp, int *label_value_pcr, int *label_squence_length,
+                                       int *cum_labels_length, int batch) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch; i += blockDim.x * gridDim.x) {
+    int L = label_squence_length[i];
+    label_squence_length[i] = 0;
+    int offset = 0;
+    if (i > 0) {
+      offset = cum_labels_length[i - 1];
+    }
+    for (int l = offset; l < offset + L; ++l) {
+      if (l == offset || label_value_sp[l] != label_value_sp[l - 1]) {
+        label_value_pcr[offset + label_squence_length[i]++] = label_value_sp[l];
+      }
+    }
+  }
+}
+
+// Runs on a single thread (<<<1, 1>>>): a serial max and prefix sum over `batch` entries.
+__global__ void UpdateLengthKernel(int *label_squence_length, int *cum_labels_length, int *max_labels_length,
+                                   int batch) {
+  max_labels_length[0] = 0;
+  for (int i = 0; i < batch; ++i) {
+    if (label_squence_length[i] > max_labels_length[0]) {
+      max_labels_length[0] = label_squence_length[i];
+    }
+    if (i == 0) {
+      cum_labels_length[i] = label_squence_length[i];
+    } else {
+      cum_labels_length[i] = label_squence_length[i] + cum_labels_length[i - 1];
+    }
+  }
+}
+
+template <typename T>
+void CalculateBwdVar(T *log_beta_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length,
+                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank,
+                     int *label_squence_length, int *cum_labels_length, bool ignore_longer_outputs_than_inputs,
+                     cudaStream_t stream) {
+  int log_prob_size = SOffSet * batch * maxtime;
+  LogBInitKernel<<<GET_BLOCKS(log_prob_size), GET_THREADS, 0, stream>>>(log_beta_b, log_prob_size);
+  CalculateBwdVarKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(
+    log_beta_b, label_value_with_blank, softmax_probs, sequence_length, ctc_merge_repeated, batch, SOffSet, maxtime,
+    blank, label_squence_length, cum_labels_length, ignore_longer_outputs_than_inputs);
+}
+
+template <typename T>
+void CalculateFwdVar(T *log_alpha_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length,
+                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank,
+                     int *label_squence_length, int *cum_labels_length, bool ignore_longer_outputs_than_inputs,
+                     cudaStream_t stream) {
+  int log_prob_size = SOffSet * batch * maxtime;
+  LogBInitKernel<<<GET_BLOCKS(log_prob_size), GET_THREADS, 0, stream>>>(log_alpha_b, log_prob_size);
+  CalculateFwdVarKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(
+    log_alpha_b, label_value_with_blank, softmax_probs, sequence_length, ctc_merge_repeated, batch, SOffSet, maxtime,
+    blank, label_squence_length, cum_labels_length, ignore_longer_outputs_than_inputs);
+}
+
+template <typename T>
+void InnerSoftMax(const T *probs, T *softmax_probs, const int *sequence_length, int max_time, int batch, int numclass,
+                  cudaStream_t stream) {
+  InnerSoftMaxKernel<<<GET_BLOCKS(batch * max_time), GET_THREADS, 0, stream>>>(probs, softmax_probs, sequence_length,
+                                                                               max_time, batch, numclass);
+}
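+// The label used by the recursions is the blank-augmented sequence
+// l' = [blank, l1, blank, l2, ..., lk, blank], of length 2 * k + 1. Batch i's slice
+// starts at offset 2 * cum_labels_length[i - 1] + i: twice the labels seen so far plus
+// one extra trailing blank per preceding batch element.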
+__global__ void GenLabelWithBlankKernel(int *label_value, int *label_value_with_blank, int *label_squence_length,
+                                        int *precum_labels_length, int *cum_labels_length, int batch, int blank) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch; i += blockDim.x * gridDim.x) {
+    int offset = 0;
+    int offset1 = 0;
+    if (i > 0) {
+      offset = 2 * cum_labels_length[i - 1] + i;
+      offset1 = precum_labels_length[i - 1];
+    }
+    for (int j = 0; j < label_squence_length[i]; ++j) {
+      label_value_with_blank[offset + 2 * j] = blank;
+      label_value_with_blank[offset + 2 * j + 1] = label_value[offset1 + j];
+    }
+    label_value_with_blank[offset + 2 * label_squence_length[i]] = blank;
+  }
+}
+
+void GenLabelWithBlank(int *label_value, int *label_value_with_blank, int *label_squence_length,
+                       int *precum_labels_length, int *cum_labels_length, int batch, int blank, cudaStream_t stream) {
+  GenLabelWithBlankKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(
+    label_value, label_value_with_blank, label_squence_length, precum_labels_length, cum_labels_length, batch, blank);
+}
+
+void GenLabelValuePCR(int *label_value_sp, int *label_value_pcr, int *label_squence_length, int *cum_labels_length,
+                      int *max_labels_length, int batch, cudaStream_t stream) {
+  GenLabelValuePCRKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(label_value_sp, label_value_pcr,
+                                                                        label_squence_length, cum_labels_length,
+                                                                        batch);
+  UpdateLengthKernel<<<1, 1, 0, stream>>>(label_squence_length, cum_labels_length, max_labels_length, batch);
+}
+
+// Scatter the sparse (indices, values) labels into a dense per-batch buffer.
+__global__ void GenLabelValueKernel(int *label_value_sp, const int64_t *label_indices, const int *label_values,
+                                    int *label_squence_length, int *cum_labels_length, int size) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+    int64_t b = label_indices[i * 2];
+    int offset = 0;
+    if (b > 0) {
+      offset = cum_labels_length[b - 1];
+    }
+    int64_t index = offset + label_indices[i * 2 + 1];
+    label_value_sp[index] = label_values[i];
+  }
+}
+__global__ void LabelValueInitKernel(int *label_value_sp, int size, int blank) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+    label_value_sp[i] = blank;
+  }
+}
+// A slot still holding the blank sentinel marks the end of that batch's labels, so the
+// true length is the number of leading non-blank entries.
+__global__ void RecalculateLengthKernel(int *label_value_sp, int *label_squence_length, int *cum_labels_length,
+                                        int batch, int blank) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch; i += blockDim.x * gridDim.x) {
+    int offset = 0;
+    if (i > 0) {
+      offset = cum_labels_length[i - 1];
+    }
+    int L = label_squence_length[i];
+    label_squence_length[i] = 0;
+    for (int j = offset; j < offset + L; ++j) {
+      if (label_value_sp[j] >= blank) {
+        break;
+      } else {
+        label_squence_length[i]++;
+      }
+    }
+  }
+}
+void GenLabelValue(int *label_value_sp, const int64_t *label_indices, const int *label_values,
+                   int *label_squence_length, int *cum_labels_length, int *max_labels_length, int size, int blank,
+                   int batch, cudaStream_t stream) {
+  LabelValueInitKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(label_value_sp, size, blank);
+  GenLabelValueKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(label_value_sp, label_indices, label_values,
+                                                                    label_squence_length, cum_labels_length, size);
+  RecalculateLengthKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(label_value_sp, label_squence_length,
+                                                                         cum_labels_length, batch, blank);
+  UpdateLengthKernel<<<1, 1, 0, stream>>>(label_squence_length, cum_labels_length, max_labels_length, batch);
+}
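+// CalculatePreLengthKernel derives the per-batch label lengths directly from the sparse
+// indices: every (batch, position) pair increments its batch's count, and the running
+// maximum of the batch index doubles as the host-side check that the label batch matches
+// the input batch. It runs on a single thread because the scan is sequential and small.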
+__global__ void CalculatePreLengthKernel(int *label_squence_length, int *precum_labels_length, int *cum_labels_length,
+                                         int *max_labels_length, const int64_t *label_indices, int batch, int size) {
+  max_labels_length[0] = 0;
+  for (int i = 0; i < size; ++i) {
+    label_squence_length[label_indices[i * 2]]++;
+    if (max_labels_length[0] < label_indices[i * 2]) {
+      max_labels_length[0] = label_indices[i * 2];
+    }
+  }
+  precum_labels_length[0] = label_squence_length[0];
+  cum_labels_length[0] = label_squence_length[0];
+  for (int i = 1; i < batch; ++i) {
+    cum_labels_length[i] = cum_labels_length[i - 1] + label_squence_length[i];
+    precum_labels_length[i] = precum_labels_length[i - 1] + label_squence_length[i];
+  }
+}
+
+__global__ void CalculateMaxSequenceKernel(const int *sequence_length, int *max_labels_length, int batch) {
+  max_labels_length[0] = 0;
+  for (int i = 0; i < batch; ++i) {
+    if (sequence_length[i] > max_labels_length[0]) {
+      max_labels_length[0] = sequence_length[i];
+    }
+  }
+}
+
+void CalculateMaxSequence(const int *sequence_length, int *max_labels_length, int batch, cudaStream_t stream) {
+  CalculateMaxSequenceKernel<<<1, 1, 0, stream>>>(sequence_length, max_labels_length, batch);
+}
+
+void CalculatePreLength(int *label_squence_length, int *precum_labels_length, int *cum_labels_length,
+                        int *max_labels_length, const int64_t *label_indices, int batch, int size,
+                        cudaStream_t stream) {
+  CalculatePreLengthKernel<<<1, 1, 0, stream>>>(label_squence_length, precum_labels_length, cum_labels_length,
+                                                max_labels_length, label_indices, batch, size);
+}
+
+template <typename T>
+void CTCLoss(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_with_blank, int batch, int SOffSet,
+             int maxtime, int numclass, const int *sequence_length, int *label_squence_length, int *cum_labels_length,
+             T *cost, T *grads, T *prob_num, bool ignore_longer_outputs_than_inputs, cudaStream_t stream) {
+  ProbInitKernel<<<GET_BLOCKS(maxtime * batch * numclass), GET_THREADS, 0, stream>>>(prob_num,
+                                                                                     maxtime * batch * numclass);
+  CTCLossKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(
+    log_alpha_b, log_beta_b, softmax_probs, label_value_with_blank, batch, SOffSet, maxtime, numclass, sequence_length,
+    label_squence_length, cum_labels_length, cost, grads, prob_num, ignore_longer_outputs_than_inputs);
+}
+
+template void CalculateFwdVar<float>(float *log_alpha_b, int *label_value_with_blank, float *softmax_probs,
+                                     const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet,
+                                     int maxtime, int blank, int *label_squence_length, int *cum_labels_length,
+                                     bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
+
+template void CalculateBwdVar<float>(float *log_beta_b, int *label_value_with_blank, float *softmax_probs,
+                                     const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet,
+                                     int maxtime, int blank, int *label_squence_length, int *cum_labels_length,
+                                     bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
+
+template void InnerSoftMax<float>(const float *probs, float *softmax_probs, const int *sequence_length, int max_time,
+                                  int batch, int numclass, cudaStream_t stream);
+
+template void CTCLoss<float>(float *log_alpha_b, float *log_beta_b, float *softmax_probs, int *label_value_with_blank,
+                             int batch, int SOffSet, int maxtime, int numclass, const int *sequence_length,
+                             int *label_squence_length, int *cum_labels_length, float *cost, float *grads,
+                             float *prob_num, bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ctcloss_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ctcloss_impl.cuh
new file mode 100644
index 0000000000..7e155ced56
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ctcloss_impl.cuh
@@ -0,0 +1,51 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH
+
+template <typename T>
+void CalculateFwdVar(T *log_alpha_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length,
+                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank,
+                     int *label_squence_length, int *cum_labels_length, bool ignore_longer_outputs_than_inputs,
+                     cudaStream_t stream);
+
+template <typename T>
+void CalculateBwdVar(T *log_beta_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length,
+                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank,
+                     int *label_squence_length, int *cum_labels_length, bool ignore_longer_outputs_than_inputs,
+                     cudaStream_t stream);
+
+template <typename T>
+void InnerSoftMax(const T *probs, T *softmax_probs, const int *sequence_length, int max_time, int batch, int numclass,
+                  cudaStream_t stream);
+
+void GenLabelValuePCR(int *label_value_sp, int *label_value_pcr, int *label_squence_length, int *cum_labels_length,
+                      int *max_labels_length, int batch, cudaStream_t stream);
+
+void GenLabelWithBlank(int *label_value, int *label_value_with_blank, int *label_squence_length,
+                       int *precum_labels_length, int *cum_labels_length, int batch, int blank, cudaStream_t stream);
+
+void GenLabelValue(int *label_value_sp, const int64_t *label_indices, const int *label_values,
+                   int *label_squence_length, int *cum_labels_length, int *max_labels_length, int size, int blank,
+                   int batch, cudaStream_t stream);
+
+void CalculatePreLength(int *label_squence_length, int *precum_labels_length, int *cum_labels_length,
+                        int *max_labels_length, const int64_t *label_indices, int batch, int size,
+                        cudaStream_t stream);
+void CalculateMaxSequence(const int *sequence_length, int *max_labels_length, int batch, cudaStream_t stream);
+template <typename T>
+void CTCLoss(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_with_blank, int batch, int SOffSet,
+             int maxtime, int numclass, const int *sequence_length, int *label_squence_length, int *cum_labels_length,
+             T *cost, T *grads, T *prob_num, bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.cc
index 71b76cb792..792f60c221 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.cc
@@ -1,31 +1,31 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h"
-
-namespace mindspore {
-namespace kernel {
-MS_REG_GPU_KERNEL_ONE(CTCLossV2,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddOutputAttr(kNumberTypeFloat32)
-                        .AddOutputAttr(kNumberTypeFloat32),
-                      CtcLossGpuKernel, float)
-}  // namespace kernel
-}  // namespace mindspore
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_ONE(CTCLoss,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      CtcLossGpuKernel, float)
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h
index ab6d4ae83e..13506cb2bd 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h
@@ -1,192 +1,233 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_CTCLOSS_GPU_KERNEL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_CTCLOSS_GPU_KERNEL_H_
-
-#include <cuda_runtime_api.h>
-#include <vector>
-#include "backend/kernel_compiler/gpu/gpu_kernel.h"
-#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
-#include "runtime/device/gpu/gpu_memory_allocator.h"
-
-namespace mindspore {
-namespace kernel {
-template <typename T>
-class CtcLossGpuKernel : public GpuKernel {
- public:
-  CtcLossGpuKernel()
-      : cudnn_handle_(nullptr),
-        probs_desc_(nullptr),
-        ctcloss_desc_(nullptr),
-        label_size_(0),
-        input_lengths_size_(0),
-        label_lengths_size_(0) {}
-  ~CtcLossGpuKernel() override { DestroyResource(); }
-
-  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
-  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
-  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
-
-  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
-              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    float *probs = GetDeviceAddress<float>(inputs, 0);
-    float *costs = GetDeviceAddress<float>(outputs, 0);
-    float *grads = GetDeviceAddress<float>(outputs, 1);
-
-    // Copy labels/input_lengths/label_length to host as cudnn7.x.x requires
-    int *labels_host = nullptr;
-    int *no_blank_labels_host = nullptr;
-    void *input_lengths_host = nullptr;
-    void *label_lengths_host = nullptr;
-    cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
-    AllocHostMem(&labels_host, &no_blank_labels_host, &input_lengths_host, &label_lengths_host, inputs);
-    CopyToHostSync(labels_host, no_blank_labels_host, input_lengths_host, label_lengths_host, inputs, stream);
-
-    size_t workspace_size = 0;
-    CHECK_CUDNN_RET_WITH_EXCEPT(
-      cudnnGetCTCLossWorkspaceSize(
-        cudnn_handle_, probs_desc_, probs_desc_, reinterpret_cast<int *>(no_blank_labels_host),
-        reinterpret_cast<int *>(label_lengths_host), reinterpret_cast<int *>(input_lengths_host),
-        CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, ctcloss_desc_, &workspace_size),
-      "cudnnGetCTCLossWorkspaceSize failed.");
-    void *workspace = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(workspace_size);
-    if (workspace == nullptr) {
-      MS_LOG(EXCEPTION) << "Failed to alloc workspace, size: " << workspace_size;
-    }
-
-    CHECK_CUDNN_RET_WITH_EXCEPT(
-      cudnnCTCLoss(cudnn_handle_, probs_desc_, probs, reinterpret_cast<int *>(no_blank_labels_host),
-                   reinterpret_cast<int *>(label_lengths_host), reinterpret_cast<int *>(input_lengths_host), costs,
-                   probs_desc_, grads, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, ctcloss_desc_, workspace, workspace_size),
-      "cudnnCtcLoss failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
-
-    device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(workspace);
-    FreeHostMem(labels_host, no_blank_labels_host, input_lengths_host, label_lengths_host);
-    return true;
-  }
-  bool Init(const CNodePtr &kernel_node) override {
-    InitResource();
-    auto probs_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-    if (probs_shape.size() != 3) {
-      MS_LOG(EXCEPTION) << "probs dims: " << probs_shape.size() << " not support.";
-    }
-    probs_dims_[0] = probs_shape[0];
-    probs_dims_[1] = probs_shape[1];
-    probs_dims_[2] = probs_shape[2];
-
-    auto labels_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
-    if (labels_dims.size() != 1 && labels_dims.size() != 2) {
-      MS_LOG(EXCEPTION) << "labels dims: " << labels_dims.size() << " not support.";
-    }
-    label_size_ = sizeof(int);
-    for (auto i : labels_dims) {
-      label_size_ *= i;
-    }
-
-    auto input_length_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
-    input_lengths_size_ = input_length_dims[0] * sizeof(int);
-    auto label_length_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
-    label_lengths_size_ = label_length_dims[0] * sizeof(int);
-    CHECK_CUDNN_RET_WITH_EXCEPT(
-      cudnnSetTensorNdDescriptorEx(probs_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 3, probs_dims_),
-      "cudnnSetTensorNdDescriptorEx failed.");
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetCTCLossDescriptorEx(ctcloss_desc_, CUDNN_DATA_FLOAT,
-                                                            CUDNN_LOSS_NORMALIZATION_SOFTMAX, CUDNN_PROPAGATE_NAN),
-                                "cudnnSetCTCLossDescriptorEx failed.");
-    InitSizeLists();
-    return true;
-  }
-
- protected:
-  void InitResource() override {
-    cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&probs_desc_), "cudnnCreateTensorDescriptor failed.");
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateCTCLossDescriptor(&ctcloss_desc_), "cudnnCreateCTCLossDescriptor failed.");
-  }
-
-  void InitSizeLists() override {
-    input_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(float));
-    input_size_list_.push_back(label_size_);
-    input_size_list_.push_back(input_lengths_size_);
-    input_size_list_.push_back(label_lengths_size_);
-
-    output_size_list_.push_back(probs_dims_[1] * sizeof(float));
-    output_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(float));
-  }
-
- private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyCTCLossDescriptor(ctcloss_desc_), "cudnnDestroyCTCLossDescriptor failed.");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(probs_desc_), "cudnnDestroyTensorDescriptor failed.");
-  }
-
-  void AllocHostMem(int **labels_host, int **no_blank_labels_host, void **input_lengths_host,
-                    void **label_lengths_host, const std::vector<AddressPtr> &inputs) {
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMallocHost(labels_host, inputs[1]->size), "cudaMallocHost failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMallocHost(no_blank_labels_host, inputs[1]->size), "cudaMallocHost failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMallocHost(input_lengths_host, inputs[2]->size), "cudaMallocHost failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMallocHost(label_lengths_host, inputs[3]->size), "cudaMallocHost failed.");
-  }
-
-  void FreeHostMem(int *labels_host, int *no_blank_labels_host, void *input_lengths_host, void *label_lengths_host) {
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaFreeHost(label_lengths_host), "cudaFreeHost failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaFreeHost(input_lengths_host), "cudaFreeHost failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaFreeHost(labels_host), "cudaFreeHost failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaFreeHost(no_blank_labels_host), "cudaFreeHost failed.");
-  }
-
-  void CopyToHostSync(int *labels_host, int *no_blank_labels_host, void *input_lengths_host, void *label_lengths_host,
-                      const std::vector<AddressPtr> &inputs, cudaStream_t stream) {
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(
-      cudaMemcpyAsync(labels_host, inputs[1]->addr, inputs[1]->size, cudaMemcpyDeviceToHost, stream),
-      "cudaMemcpyAsync failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(
-      cudaMemcpyAsync(input_lengths_host, inputs[2]->addr, inputs[2]->size, cudaMemcpyDeviceToHost, stream),
-      "cudaMemcpyAsync failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(
-      cudaMemcpyAsync(label_lengths_host, inputs[3]->addr, inputs[3]->size, cudaMemcpyDeviceToHost, stream),
-      "cudaMemcpyAsync failed.");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
-
-    // remove blank element
-    size_t j = 0;
-    for (size_t i = 0; i < inputs[1]->size / sizeof(int); i++) {
-      if (labels_host[i] != 0) {
-        no_blank_labels_host[j] = labels_host[i];
-        j++;
-      }
-    }
-  }
-
-  std::vector<size_t> input_size_list_;
-  std::vector<size_t> output_size_list_;
-  std::vector<size_t> workspace_size_list_;
-
-  cudnnHandle_t cudnn_handle_;
-  cudnnTensorDescriptor_t probs_desc_;
-  cudnnCTCLossDescriptor_t ctcloss_desc_;
-  int probs_dims_[3] = {0};
-  int label_size_;
-  int input_lengths_size_;
-  int label_lengths_size_;
-};
-}  // namespace kernel
-}  // namespace mindspore
-
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_CTCLOSS_GPU_KERNEL_H_
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_CTCLOSS_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_CTCLOSS_GPU_KERNEL_H_
+
+#include <cuda_runtime_api.h>
+#include <limits>
+#include <vector>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+#include "runtime/device/gpu/gpu_memory_allocator.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/ctcloss_impl.cuh"
+namespace mindspore {
+namespace kernel {
+template <typename T>
+class CtcLossGpuKernel : public GpuKernel {
+ public:
+  CtcLossGpuKernel()
+      : label_indice_size_(0),
+        label_size_(0),
+        squence_lengths_size_(0),
+        preprocess_collapse_repeated_(false),
+        ctc_merge_repeated_(true),
+        ignore_longer_outputs_than_inputs_(false) {}
+  ~CtcLossGpuKernel() override = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
+    const T *probs = GetDeviceAddress<T>(inputs, 0);
+    const int64_t *label_indices = GetDeviceAddress<int64_t>(inputs, 1);
+    const int *label_values = GetDeviceAddress<int>(inputs, 2);
+    const int *sequence_length = GetDeviceAddress<int>(inputs, 3);
+    T *costs = GetDeviceAddress<T>(outputs, 0);
+    T *grads = GetDeviceAddress<T>(outputs, 1);
+    T *softmax_probs = GetDeviceAddress<T>(workspace, 0);
+    int *cum_labels_length = GetDeviceAddress<int>(workspace, 1);
+    int *label_squence_length = GetDeviceAddress<int>(workspace, 2);
+    int *label_value_sp = GetDeviceAddress<int>(workspace, 3);
+    int *label_value_pcr = GetDeviceAddress<int>(workspace, 4);
+    T *prob_num = GetDeviceAddress<T>(workspace, 5);
+    int *precum_labels_length = GetDeviceAddress<int>(workspace, 6);
+    int *max_labels_length = GetDeviceAddress<int>(workspace, 7);
+    int numclass = SizeToInt(probs_dims_[2]);
+    int batch = SizeToInt(probs_dims_[1]);
+    int max_time = SizeToInt(probs_dims_[0]);
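+    // The launch proceeds in stages: (1) softmax over the raw activations, (2) dense
+    // label reconstruction from the sparse indices/values, with optional collapsing of
+    // repeats, (3) insertion of blanks, (4) forward/backward dynamic programming, and
+    // (5) the loss and gradient reduction. The small scalar results (max lengths, batch
+    // checks) are copied back and validated on the host between stages.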
+    int max_sequence = 0;
+    CalculateMaxSequence(sequence_length, max_labels_length, batch, stream);
+    CHECK_CUDA_RET_WITH_EXCEPT(
+      cudaMemcpyAsync(&max_sequence, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
+      "cudaMemcpyAsync failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
+    if (max_time < max_sequence) {
+      MS_LOG(EXCEPTION) << "max_time should be no less than the max sequence length.";
+    }
+    InnerSoftMax(probs, softmax_probs, sequence_length, max_time, batch, numclass, stream);
+    MemsetForWS(label_value_pcr, cum_labels_length, label_squence_length, costs, grads, stream);
+    int max_labels_length_host = 0;
+    int batch_label = 0;
+    int *label_value_with_blank = nullptr;
+    T *log_alpha_b = nullptr;
+    T *log_beta_b = nullptr;
+    CalculatePreLength(label_squence_length, precum_labels_length, cum_labels_length, max_labels_length,
+                       label_indices, batch, label_size_ / sizeof(int), stream);
+    CHECK_CUDA_RET_WITH_EXCEPT(
+      cudaMemcpyAsync(&batch_label, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
+      "cudaMemcpyAsync failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
+    if (batch != batch_label + 1) {
+      MS_LOG(EXCEPTION) << "label batch should be equal to input batch.";
+    }
+    GenLabelValue(label_value_sp, label_indices, label_values, label_squence_length, cum_labels_length,
+                  max_labels_length, label_size_ / sizeof(int), numclass - 1, batch, stream);
+    if (preprocess_collapse_repeated_) {
+      GenLabelValuePCR(label_value_sp, label_value_pcr, label_squence_length, cum_labels_length, max_labels_length,
+                       batch, stream);
+    }
+    CHECK_CUDA_RET_WITH_EXCEPT(
+      cudaMemcpyAsync(&max_labels_length_host, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
+      "cudaMemcpyAsync failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
+    int SOffSet = 2 * max_labels_length_host + 1;
+    int log_prob_size = batch * SOffSet * max_time;
+    if (!ignore_longer_outputs_than_inputs_ && max_labels_length_host > max_time) {
+      MS_LOG(EXCEPTION) << "output size is greater than input size.";
+    }
+    MemManageForCus(&log_alpha_b, &log_beta_b, &label_value_with_blank, cum_labels_length, log_prob_size, batch,
+                    stream);
+
+    if (preprocess_collapse_repeated_) {
+      GenLabelWithBlank(label_value_pcr, label_value_with_blank, label_squence_length, precum_labels_length,
+                        cum_labels_length, batch, numclass - 1, stream);
+    } else {
+      GenLabelWithBlank(label_value_sp, label_value_with_blank, label_squence_length, precum_labels_length,
+                        cum_labels_length, batch, numclass - 1, stream);
+    }
+
+    CalculateFwdVar(log_alpha_b, label_value_with_blank, softmax_probs, sequence_length, ctc_merge_repeated_, batch,
+                    SOffSet, max_time, numclass - 1, label_squence_length, cum_labels_length,
+                    ignore_longer_outputs_than_inputs_, stream);
+    CalculateBwdVar(log_beta_b, label_value_with_blank, softmax_probs, sequence_length, ctc_merge_repeated_, batch,
+                    SOffSet, max_time, numclass - 1, label_squence_length, cum_labels_length,
+                    ignore_longer_outputs_than_inputs_, stream);
+    CTCLoss(log_alpha_b, log_beta_b, softmax_probs, label_value_with_blank, batch, SOffSet, max_time, numclass,
+            sequence_length, label_squence_length, cum_labels_length, costs, grads, prob_num,
+            ignore_longer_outputs_than_inputs_, stream);
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
+    FreeMem(label_value_with_blank, log_alpha_b, log_beta_b);
+    return true;
+  }
+  bool Init(const CNodePtr &kernel_node) override {
+    InitResource();
+    auto probs_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    if (probs_shape.size() != 3) {
+      MS_LOG(EXCEPTION) << "probs dims: " << probs_shape.size() << " not supported.";
+    }
+    probs_dims_[0] = probs_shape[0];
+    probs_dims_[1] = probs_shape[1];
+    probs_dims_[2] = probs_shape[2];
+    auto indice_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
+    auto labels_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
+    if (labels_dims.size() != 1) {
+      MS_LOG(EXCEPTION) << "labels dims: " << labels_dims.size() << " not supported.";
+    }
+    if (indice_dims.size() != 2) {
+      MS_LOG(EXCEPTION) << "labels indice dims: " << indice_dims.size() << " not supported.";
+    }
+    label_size_ = sizeof(int);
+    for (auto i : labels_dims) {
+      label_size_ *= i;
+    }
+    label_indice_size_ = sizeof(int64_t);
+    for (auto i : indice_dims) {
+      label_indice_size_ *= i;
+    }
+    auto squence_length_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
+    squence_lengths_size_ = squence_length_dims[0] * sizeof(int);
+    preprocess_collapse_repeated_ = GetAttr<bool>(kernel_node, "preprocess_collapse_repeated");
+    ctc_merge_repeated_ = GetAttr<bool>(kernel_node, "ctc_merge_repeated");
+    ignore_longer_outputs_than_inputs_ = GetAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
+    InitSizeLists();
+    return true;
+  }
+
+ protected:
+  void InitSizeLists() override {
+    input_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(T));
+    input_size_list_.push_back(label_indice_size_);
+    input_size_list_.push_back(label_size_);
+    input_size_list_.push_back(squence_lengths_size_);
+    // workspace: softmax_probs, cum_labels_length, label_squence_length, label_value_sp,
+    // label_value_pcr, prob_num, precum_labels_length, max_labels_length
+    workspace_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(T));
+    workspace_size_list_.push_back(squence_lengths_size_);
+    workspace_size_list_.push_back(squence_lengths_size_);
+    workspace_size_list_.push_back(label_size_);
+    workspace_size_list_.push_back(label_size_);
+    workspace_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(T));
+    workspace_size_list_.push_back(squence_lengths_size_);
+    workspace_size_list_.push_back(sizeof(int));
+    output_size_list_.push_back(probs_dims_[1] * sizeof(T));
+    output_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(T));
+  }
+  void MemsetForWS(int *label_value_pcr, int *cum_labels_length, int *label_squence_length, T *costs, T *grads,
+                   cudaStream_t stream) {
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(label_value_pcr, static_cast<int>(0), label_size_, stream),
+                               "cudaMemSet failed in CtcLossGpuKernel::Launch.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(cum_labels_length, static_cast<int>(0), squence_lengths_size_, stream),
+                               "cudaMemSet failed in CtcLossGpuKernel::Launch.");
+    CHECK_CUDA_RET_WITH_EXCEPT(
+      cudaMemsetAsync(label_squence_length, static_cast<int>(0), squence_lengths_size_, stream),
+      "cudaMemSet failed in CtcLossGpuKernel::Launch.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(costs, static_cast<int>(0), probs_dims_[1] * sizeof(T), stream),
+                               "cudaMemSet failed in CtcLossGpuKernel::Launch.");
+    CHECK_CUDA_RET_WITH_EXCEPT(
+      cudaMemsetAsync(grads, static_cast<int>(0), probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(T),
+                      stream),
+      "cudaMemSet failed in CtcLossGpuKernel::Launch.");
+  }
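+  // Unlike the fixed-size workspace above, log_alpha_b / log_beta_b / label_value_with_blank
+  // depend on the longest label, which is only known after the label indices have been
+  // inspected at launch time, so they are allocated here with cudaMalloc rather than
+  // through the workspace size list.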
+  void MemManageForCus(T **log_alpha_b, T **log_beta_b, int **label_value_with_blank, int *cum_labels_length,
+                       int log_prob_size, int batch, cudaStream_t stream) {
+    int total_labels_size_host = 0;
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMalloc(reinterpret_cast<void **>(log_alpha_b), sizeof(T) * log_prob_size),
+                               "cudaMalloc failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMalloc(reinterpret_cast<void **>(log_beta_b), sizeof(T) * log_prob_size),
+                               "cudaMalloc failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&total_labels_size_host, cum_labels_length + batch - 1, sizeof(int),
+                                               cudaMemcpyDeviceToHost, stream),
+                               "cudaMemcpyAsync failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMalloc(reinterpret_cast<void **>(label_value_with_blank),
+                                          sizeof(int) * (2 * total_labels_size_host + batch)),
+                               "cudaMalloc failed.");
+  }
+
+  void FreeMem(int *label_value_with_blank, T *log_alpha_b, T *log_beta_b) {
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaFree(label_value_with_blank), "cudaFree failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaFree(log_alpha_b), "cudaFree failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaFree(log_beta_b), "cudaFree failed.");
+  }
+
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+
+  size_t probs_dims_[3] = {0};
+  int label_indice_size_;
+  int label_size_;
+  int squence_lengths_size_;
+  bool preprocess_collapse_repeated_;
+  bool ctc_merge_repeated_;
+  bool ignore_longer_outputs_than_inputs_;
+  T kLogZero_ = -std::numeric_limits<T>::infinity();
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_CTCLOSS_GPU_KERNEL_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctclossv2_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctclossv2_gpu_kernel.cc
new file mode 100644
index 0000000000..7006654c8f
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctclossv2_gpu_kernel.cc
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/nn/ctclossv2_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_ONE(CTCLossV2,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      CtcLossV2GpuKernel, float)
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctclossv2_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctclossv2_gpu_kernel.h
new file mode 100644
index 0000000000..edb0478a4b
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctclossv2_gpu_kernel.h
@@ -0,0 +1,192 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_CTCLOSSV2_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_CTCLOSSV2_GPU_KERNEL_H_
+
+#include <cuda_runtime_api.h>
+#include <vector>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+#include "runtime/device/gpu/gpu_memory_allocator.h"
+
+namespace mindspore {
+namespace kernel {
+template <typename T>
+class CtcLossV2GpuKernel : public GpuKernel {
+ public:
+  CtcLossV2GpuKernel()
+      : cudnn_handle_(nullptr),
+        probs_desc_(nullptr),
+        ctcloss_desc_(nullptr),
+        label_size_(0),
+        input_lengths_size_(0),
+        label_lengths_size_(0) {}
+  ~CtcLossV2GpuKernel() override { DestroyResource(); }
+
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    float *probs = GetDeviceAddress<float>(inputs, 0);
+    float *costs = GetDeviceAddress<float>(outputs, 0);
+    float *grads = GetDeviceAddress<float>(outputs, 1);
+
+    // Copy labels/input_lengths/label_length to host as cudnn7.x.x requires
+    int *labels_host = nullptr;
+    int *no_blank_labels_host = nullptr;
+    void *input_lengths_host = nullptr;
+    void *label_lengths_host = nullptr;
+    cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
+    AllocHostMem(&labels_host, &no_blank_labels_host, &input_lengths_host, &label_lengths_host, inputs);
+    CopyToHostSync(labels_host, no_blank_labels_host, input_lengths_host, label_lengths_host, inputs, stream);
+
+    size_t workspace_size = 0;
+    CHECK_CUDNN_RET_WITH_EXCEPT(
+      cudnnGetCTCLossWorkspaceSize(
+        cudnn_handle_, probs_desc_, probs_desc_, reinterpret_cast<int *>(no_blank_labels_host),
+        reinterpret_cast<int *>(label_lengths_host), reinterpret_cast<int *>(input_lengths_host),
+        CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, ctcloss_desc_, &workspace_size),
+      "cudnnGetCTCLossWorkspaceSize failed.");
+    void *workspace = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(workspace_size);
+    if (workspace == nullptr) {
+      MS_LOG(EXCEPTION) << "Failed to alloc workspace, size: " << workspace_size;
+    }
+
+    CHECK_CUDNN_RET_WITH_EXCEPT(
+      cudnnCTCLoss(cudnn_handle_, probs_desc_, probs, reinterpret_cast<int *>(no_blank_labels_host),
+                   reinterpret_cast<int *>(label_lengths_host), reinterpret_cast<int *>(input_lengths_host), costs,
+                   probs_desc_, grads, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, ctcloss_desc_, workspace, workspace_size),
+      "cudnnCtcLoss failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
+
+    device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(workspace);
+    FreeHostMem(labels_host, no_blank_labels_host, input_lengths_host, label_lengths_host);
+    return true;
+  }
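+  // cuDNN 7.x takes the label and length arrays as host pointers and fixes the blank
+  // at class index 0, which is why the inputs are staged through pinned host memory
+  // and the zero entries are stripped from the label stream in CopyToHostSync below.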
+  bool Init(const CNodePtr &kernel_node) override {
+    InitResource();
+    auto probs_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    if (probs_shape.size() != 3) {
+      MS_LOG(EXCEPTION) << "probs dims: " << probs_shape.size() << " not supported.";
+    }
+    probs_dims_[0] = probs_shape[0];
+    probs_dims_[1] = probs_shape[1];
+    probs_dims_[2] = probs_shape[2];
+
+    auto labels_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
+    if (labels_dims.size() != 1 && labels_dims.size() != 2) {
+      MS_LOG(EXCEPTION) << "labels dims: " << labels_dims.size() << " not supported.";
+    }
+    label_size_ = sizeof(int);
+    for (auto i : labels_dims) {
+      label_size_ *= i;
+    }
+
+    auto input_length_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
+    input_lengths_size_ = input_length_dims[0] * sizeof(int);
+    auto label_length_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
+    label_lengths_size_ = label_length_dims[0] * sizeof(int);
+    CHECK_CUDNN_RET_WITH_EXCEPT(
+      cudnnSetTensorNdDescriptorEx(probs_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 3, probs_dims_),
+      "cudnnSetTensorNdDescriptorEx failed.");
+    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetCTCLossDescriptorEx(ctcloss_desc_, CUDNN_DATA_FLOAT,
+                                                            CUDNN_LOSS_NORMALIZATION_SOFTMAX, CUDNN_PROPAGATE_NAN),
+                                "cudnnSetCTCLossDescriptorEx failed.");
+    InitSizeLists();
+    return true;
+  }
+
+ protected:
+  void InitResource() override {
+    cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
+    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&probs_desc_), "cudnnCreateTensorDescriptor failed.");
+    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateCTCLossDescriptor(&ctcloss_desc_), "cudnnCreateCTCLossDescriptor failed.");
+  }
+
+  void InitSizeLists() override {
+    input_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(float));
+    input_size_list_.push_back(label_size_);
+    input_size_list_.push_back(input_lengths_size_);
+    input_size_list_.push_back(label_lengths_size_);
+
+    output_size_list_.push_back(probs_dims_[1] * sizeof(float));
+    output_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(float));
+  }
+
+ private:
+  void DestroyResource() noexcept {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyCTCLossDescriptor(ctcloss_desc_), "cudnnDestroyCTCLossDescriptor failed.");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(probs_desc_), "cudnnDestroyTensorDescriptor failed.");
+  }
+
+  void AllocHostMem(int **labels_host, int **no_blank_labels_host, void **input_lengths_host,
+                    void **label_lengths_host, const std::vector<AddressPtr> &inputs) {
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMallocHost(labels_host, inputs[1]->size), "cudaMallocHost failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMallocHost(no_blank_labels_host, inputs[1]->size), "cudaMallocHost failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMallocHost(input_lengths_host, inputs[2]->size), "cudaMallocHost failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMallocHost(label_lengths_host, inputs[3]->size), "cudaMallocHost failed.");
+  }
+
+  void FreeHostMem(int *labels_host, int *no_blank_labels_host, void *input_lengths_host, void *label_lengths_host) {
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaFreeHost(label_lengths_host), "cudaFreeHost failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaFreeHost(input_lengths_host), "cudaFreeHost failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaFreeHost(labels_host), "cudaFreeHost failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaFreeHost(no_blank_labels_host), "cudaFreeHost failed.");
+  }
+  void CopyToHostSync(int *labels_host, int *no_blank_labels_host, void *input_lengths_host, void *label_lengths_host,
+                      const std::vector<AddressPtr> &inputs, cudaStream_t stream) {
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(
+      cudaMemcpyAsync(labels_host, inputs[1]->addr, inputs[1]->size, cudaMemcpyDeviceToHost, stream),
+      "cudaMemcpyAsync failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(
+      cudaMemcpyAsync(input_lengths_host, inputs[2]->addr, inputs[2]->size, cudaMemcpyDeviceToHost, stream),
+      "cudaMemcpyAsync failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(
+      cudaMemcpyAsync(label_lengths_host, inputs[3]->addr, inputs[3]->size, cudaMemcpyDeviceToHost, stream),
+      "cudaMemcpyAsync failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
+
+    // remove blank element
+    size_t j = 0;
+    for (size_t i = 0; i < inputs[1]->size / sizeof(int); i++) {
+      if (labels_host[i] != 0) {
+        no_blank_labels_host[j] = labels_host[i];
+        j++;
+      }
+    }
+  }
+
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+
+  cudnnHandle_t cudnn_handle_;
+  cudnnTensorDescriptor_t probs_desc_;
+  cudnnCTCLossDescriptor_t ctcloss_desc_;
+  int probs_dims_[3] = {0};
+  int label_size_;
+  int input_lengths_size_;
+  int label_lengths_size_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_CTCLOSSV2_GPU_KERNEL_H_