forked from mindspore-Ecosystem/mindspore
!40103 adapt cuda.ops code for msvc (del CUDA_LIB_EXPORT from template func)
Merge pull request !40103 from chenminmin/master
commit 05ba391e4c
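Every hunk below makes the same change: the CUDA_LIB_EXPORT attribute is removed from template function definitions in the cuda_ops implementation files so that they compile under MSVC, which is what the PR title describes. The templates themselves are not concrete symbols; presumably the explicit instantiations elsewhere in these files keep the attribute and remain the exported entry points. A minimal sketch of that layout, with hypothetical names and a simplified stand-in for the real macro:

// Sketch only; names and the macro below are illustrative, not MindSpore's actual header.
#include <cuda_runtime.h>
#include <cstddef>

#ifdef _MSC_VER
#define CUDA_LIB_EXPORT __declspec(dllexport)
#else
#define CUDA_LIB_EXPORT  // simplified; the real macro also handles visibility/dllimport
#endif

template <typename T>
__global__ void ScaleKernel(const T *in, T *out, size_t n, T factor) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
    out[i] = in[i] * factor;
  }
}

// Template definition: no CUDA_LIB_EXPORT, which is what this PR removes.
// The template is not a concrete symbol, so the export attribute belongs on
// the explicit instantiation below instead.
template <typename T>
void CalScale(const T *in, T *out, size_t n, T factor, cudaStream_t stream) {
  ScaleKernel<T><<<256, 256, 0, stream>>>(in, out, n, factor);
}

// Explicit instantiation: this concrete function is what the shared library exports.
template CUDA_LIB_EXPORT void CalScale<float>(const float *, float *, size_t, float, cudaStream_t);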
@@ -30,10 +30,10 @@ __global__ void AdaptiveMaxPool2DGradKernel(const T *input_data, const S *max_in
 }

 template <typename T, typename S>
-CUDA_LIB_EXPORT void CalAdaptiveMaxPool2DGrad(const T *input_data, const S *max_index, const int n, const int c,
-                                              const uint input_height, const uint input_width,
-                                              const uint output_height, const uint output_width, T *output_data,
-                                              const uint32_t &device_id, cudaStream_t cuda_stream) {
+void CalAdaptiveMaxPool2DGrad(const T *input_data, const S *max_index, const int n, const int c,
+                              const uint input_height, const uint input_width,
+                              const uint output_height, const uint output_width, T *output_data,
+                              const uint32_t &device_id, cudaStream_t cuda_stream) {
   const int input_hw = input_height * input_width;
   const int input_chw = c * input_hw;
   const int input_nchw = n * input_chw;
@@ -194,14 +194,14 @@ __global__ void ComputeInferenceGradsKernel(const T *dy, const T *x, const float
 }

 template <typename T>
-CUDA_LIB_EXPORT void BatchNormGradGradTraining(const T *dy, const T *x, const float *scale, const float *mean,
-                                               const float *variance, const T *dout_dx, const float *dout_dscale,
-                                               const float *dout_dbias, T *ddy, T *dx, float *dscale, float *inv_std,
-                                               float *x_hat, float *mean_dy, float *mean_dout_dx,
-                                               float *mean_dy_mul_x_hat, float *mean_dout_dx_mul_x_hat,
-                                               float *mean_dy_mul_dout_dx, const ShapeInfo &shape_info,
-                                               DataFormat format, float epsilon, uint32_t device_id,
-                                               cudaStream_t stream) {
+void BatchNormGradGradTraining(const T *dy, const T *x, const float *scale, const float *mean,
+                               const float *variance, const T *dout_dx, const float *dout_dscale,
+                               const float *dout_dbias, T *ddy, T *dx, float *dscale, float *inv_std,
+                               float *x_hat, float *mean_dy, float *mean_dout_dx,
+                               float *mean_dy_mul_x_hat, float *mean_dout_dx_mul_x_hat,
+                               float *mean_dy_mul_dout_dx, const ShapeInfo &shape_info,
+                               DataFormat format, float epsilon, uint32_t device_id,
+                               cudaStream_t stream) {
   size_t size = shape_info.n * shape_info.c * shape_info.h * shape_info.w;
   ComputeInvStdKernel<<<CUDA_BLOCKS(device_id, shape_info.c), CUDA_THREADS(device_id), 0, stream>>>(
     variance, shape_info, epsilon, inv_std);
@@ -220,11 +220,11 @@ CUDA_LIB_EXPORT void BatchNormGradGradTraining(const T *dy, const T *x, const fl
 }

 template <typename T>
-CUDA_LIB_EXPORT void BatchNormGradGradInference(const T *dy, const T *x, const float *scale, const float *mean,
-                                                const float *variance, const T *dout_dx, const float *dout_dscale,
-                                                const float *dout_dbias, T *ddy, T *dx, float *dscale, float *inv_std,
-                                                float *tmp, const ShapeInfo &shape_info, DataFormat format,
-                                                float epsilon, uint32_t device_id, cudaStream_t stream) {
+void BatchNormGradGradInference(const T *dy, const T *x, const float *scale, const float *mean,
+                                const float *variance, const T *dout_dx, const float *dout_dscale,
+                                const float *dout_dbias, T *ddy, T *dx, float *dscale, float *inv_std,
+                                float *tmp, const ShapeInfo &shape_info, DataFormat format,
+                                float epsilon, uint32_t device_id, cudaStream_t stream) {
   size_t size = shape_info.n * shape_info.c * shape_info.h * shape_info.w;
   ComputeInvStdKernel<<<CUDA_BLOCKS(device_id, shape_info.c), CUDA_THREADS(device_id), 0, stream>>>(
     variance, shape_info, epsilon, inv_std);
@@ -19,6 +19,8 @@
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
 #include "include/cuda_fp16.h"

+constexpr float kFloatEplison = 1e-37;
+
 // Basic function
 template <typename T>
 struct GreaterFunc {
@@ -20,8 +20,6 @@
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"

-const float kFloatEplison = 1e-37;
-
 enum BroadcastOpType {
   BROADCAST_TYPE_GREATER = 0,
   BROADCAST_TYPE_LESS = 1,
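Taken together, the two hunks above relocate kFloatEplison: the non-constexpr float is dropped from the broadcast header, and a constexpr copy is added to the implementation file that includes broadcast_impl.cuh, keeping the constant local to the file that uses it. A small, hypothetical illustration of the kind of near-zero guard such a constant is typically used for (not code from the broadcast kernels):

// Hypothetical device helper; illustrates a div-no-nan style guard using the constant.
constexpr float kFloatEplison = 1e-37;

__device__ __forceinline__ float DivNoNanExample(float a, float b) {
  // Treat near-zero denominators as zero output instead of producing inf/nan.
  return fabsf(b) < kFloatEplison ? 0.0f : a / b;
}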
@@ -15,7 +15,9 @@
 */

 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
+#ifndef _MSC_VER
 #include <pthread.h>
+#endif
 #include <unordered_map>

 namespace mindspore {
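The guard added above is needed because MSVC toolchains do not provide <pthread.h>, so the POSIX-only include has to be skipped when building on Windows. As a generic illustration only (a hypothetical helper, not code from cuda_device_info), synchronization that must work on both toolchains can fall back to the C++ standard library:

// Generic illustration with a hypothetical helper; std::mutex is available on both MSVC and POSIX toolchains.
#ifndef _MSC_VER
#include <pthread.h>
#endif
#include <mutex>

namespace {
std::mutex g_device_info_mutex;
}  // namespace

void UpdateDeviceInfoThreadSafe(int device_id) {
  std::lock_guard<std::mutex> lock(g_device_info_mutex);
  // ... update per-device bookkeeping for device_id here ...
  (void)device_id;
}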
@@ -298,9 +298,9 @@ void KernelHelper(BinaryFunctor functor, ValueType init, const ValueType *input_
 }

 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void CumMinMax(CumOpType cum_op_type, const DataType *input_ptr, DataType *value_ptr,
-                               IndexType *index_ptr, size_t outer_size_st, size_t axis_size_st, size_t inner_size_st,
-                               const uint32_t &device_id, cudaStream_t cuda_stream) {
+void CumMinMax(CumOpType cum_op_type, const DataType *input_ptr, DataType *value_ptr,
+               IndexType *index_ptr, size_t outer_size_st, size_t axis_size_st, size_t inner_size_st,
+               const uint32_t &device_id, cudaStream_t cuda_stream) {
   switch (cum_op_type) {
     case CUMMIN: {
       KernelHelper(BinaryFunctor<thrust::less_equal<DataType>, DataType>{}, NumericMax<DataType>(), input_ptr,
@@ -24,7 +24,6 @@ using Complex = mindspore::utils::Complex<T>;
 __inline__ __device__ int PermutationOrder(int m, const int *per_batch_pivot) {
   int permutation_order = 0;
   for (int i = 0; i < m - 1; ++i) {
-    // Refer to http://icl.cs.utk.edu/lapack-forum/viewtopic.php?f=2&t=340 .
     permutation_order += per_batch_pivot[i] != (i + 1);
   }
   return permutation_order;
@@ -81,9 +80,9 @@ __global__ void CalculateDeterminantByLuKernel(const T *lu_input, const int *piv
 }

 template <typename T>
-CUDA_LIB_EXPORT void CalculateDeterminantByLu(const T *lu_input, const int *pivot, int m, int batch_size,
-                                              bool is_sign_log_determinant, T *determinant_output, T *sign_output,
-                                              const uint32_t &device_id, cudaStream_t cuda_stream) {
+void CalculateDeterminantByLu(const T *lu_input, const int *pivot, int m, int batch_size,
+                              bool is_sign_log_determinant, T *determinant_output, T *sign_output,
+                              const uint32_t &device_id, cudaStream_t cuda_stream) {
   // Parallelization by batch_size.
   CalculateDeterminantByLuKernel<<<CUDA_BLOCKS(device_id, batch_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
     lu_input, pivot, m, batch_size, is_sign_log_determinant, determinant_output, sign_output);
@@ -22,8 +22,8 @@
 #include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"

 template <typename T>
-CUDA_LIB_EXPORT void HistogramFixedWidthKernel(int num_samples, const T *d_samples, const double *d_levels,
-                                               int32_t *d_histogram, int64_t num_levels, cudaStream_t cuda_stream) {
+void HistogramFixedWidthKernel(int num_samples, const T *d_samples, const double *d_levels,
+                               int32_t *d_histogram, int64_t num_levels, cudaStream_t cuda_stream) {
   void *d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
   (void)cub::DeviceHistogram::HistogramRange(nullptr, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
@@ -89,8 +89,8 @@ void ApplyLambEraly(const size_t size, T *variable, T *m, T *v, const float *bet
 }

 template <typename T>
-CUDA_LIB_EXPORT void ApplyLambLater(const size_t size, T *variable, const float *lr, const float *update,
-                                    const float *trust_ratio, cudaStream_t cuda_stream) {
+void ApplyLambLater(const size_t size, T *variable, const float *lr, const float *update,
+                    const float *trust_ratio, cudaStream_t cuda_stream) {
   ApplyLambAfterNormKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, variable, lr, update, trust_ratio);
 }
@@ -67,10 +67,10 @@ __global__ void MaskedSelectKernel(const T *input_ptr, const size_t *index_ptr,

 // the i is input shape, the j is mask shape, the o is broadcast shape
 template <typename T>
-CUDA_LIB_EXPORT void MaskedSelect(const T *input_ptr, const bool *mask_ptr, size_t *index_ptr,
-                                  const std::vector<size_t> i, const std::vector<size_t> j, const std::vector<size_t> o,
-                                  T *input_broadcast_ptr, bool *mask_broadcast_ptr,
-                                  T *output_ptr, cudaStream_t cuda_stream) {
+void MaskedSelect(const T *input_ptr, const bool *mask_ptr, size_t *index_ptr,
+                  const std::vector<size_t> i, const std::vector<size_t> j, const std::vector<size_t> o,
+                  T *input_broadcast_ptr, bool *mask_broadcast_ptr,
+                  T *output_ptr, cudaStream_t cuda_stream) {
   size_t broadcast_size = o[0] * o[1] * o[2] * o[3] * o[4] * o[5] * o[6];
   const T *last_input = nullptr;
   const bool *last_mask = nullptr;
@@ -25,8 +25,8 @@ __global__ void NextAfterKernel(const size_t size, const T *input1, const T *inp
 }

 template <typename T>
-CUDA_LIB_EXPORT void NextAfter(const size_t size, const T *input1, const T *input2, T *output,
-                               const uint32_t &device_id, cudaStream_t cuda_stream) {
+void NextAfter(const size_t size, const T *input1, const T *input2, T *output,
+               const uint32_t &device_id, cudaStream_t cuda_stream) {
   NextAfterKernel<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream
     >>>(size, input1, input2, output);
   return;
@@ -76,9 +76,9 @@ __global__ void NonZeroKernel(IndexType *output_ptr, const size_t *output_size_p
 }

 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void NonZero(const DataType *input_ptr, IndexType *output_ptr, size_t *output_size_ptr,
-                             const std::vector<size_t> &input_shape, size_t input_size, const uint32_t &device_id,
-                             cudaStream_t cuda_stream) {
+void NonZero(const DataType *input_ptr, IndexType *output_ptr, size_t *output_size_ptr,
+             const std::vector<size_t> &input_shape, size_t input_size, const uint32_t &device_id,
+             cudaStream_t cuda_stream) {
   // Set the index (1-D base) for non-zero elements and place them into output.
   // To support in place operation later, we use custom output iterator,
   // which is inspired by cub library. And output_size_ptr stores the number of non-zero elements.
@@ -85,10 +85,10 @@ __global__ void ReverseSequence(const size_t size, const T *input, const S *seq_
 }

 template <typename T, typename S>
-CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
-                                        const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
-                                        size_t *input_shape_cum_ptr, size_t shape_size, T *output,
-                                        cudaStream_t cuda_stream) {
+void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
+                        const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
+                        size_t *input_shape_cum_ptr, size_t shape_size, T *output,
+                        cudaStream_t cuda_stream) {
   ComputeCumShape<<<1, 1, 0, cuda_stream>>>(input_shape_ptr, input_shape_cum_ptr, shape_size);
   ReverseSequence<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
     size, input, seq_len, batch_dim, seq_dim, cur_pos_arr, input_shape_ptr, input_shape_cum_ptr, shape_size, output);
@@ -212,14 +212,14 @@ __global__ void SparseAddKernel(const T *a_indices, const S *a_values, const T *
 }

 template <typename T, typename S, typename K>
-CUDA_LIB_EXPORT void SparseAdd(const T *a_indices, const S *a_values, const T *b_indices, const S *b_values,
-                               T *sum_indices, S *sum_values,
-                               size_t *a_value_index, size_t *b_value_index, bool *is_from_a, S* whole_values,
-                               size_t *place_holder_index, int64_t *indices, bool *threshold_valid,
-                               const size_t a_indices_num, const size_t b_indices_num,
-                               S *res_store_mem, int64_t *sum_count,
-                               const K *threshold, const uint32_t &device_id,
-                               cudaStream_t cuda_stream) {
+void SparseAdd(const T *a_indices, const S *a_values, const T *b_indices, const S *b_values,
+               T *sum_indices, S *sum_values,
+               size_t *a_value_index, size_t *b_value_index, bool *is_from_a, S* whole_values,
+               size_t *place_holder_index, int64_t *indices, bool *threshold_valid,
+               const size_t a_indices_num, const size_t b_indices_num,
+               S *res_store_mem, int64_t *sum_count,
+               const K *threshold, const uint32_t &device_id,
+               cudaStream_t cuda_stream) {
   SparseAddKernel<<<GET_BLOCKS(1), 1, 0, cuda_stream>>>(
     a_indices, a_values, b_indices, b_values,
     sum_indices, sum_values,
@@ -54,12 +54,12 @@ __global__ void GpuCopy(IndexType *d_in, IndexType *d_out, int N) {
 }

 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void SparseMatrixSoftmax(int shape_size, int batch_pointers_size, int row_pointers_size,
-                                         int col_indices_size, IndexType *x_dense_shape, IndexType *x_batch_pointers,
-                                         IndexType *x_row_pointers, IndexType *x_col_indices, DataType *x_values,
-                                         IndexType *y_dense_shape, IndexType *y_batch_pointers,
-                                         IndexType *y_row_pointers, IndexType *y_col_indices, DataType *softmax,
-                                         uint32_t device_id, cudaStream_t cuda_stream) {
+void SparseMatrixSoftmax(int shape_size, int batch_pointers_size, int row_pointers_size,
+                         int col_indices_size, IndexType *x_dense_shape, IndexType *x_batch_pointers,
+                         IndexType *x_row_pointers, IndexType *x_col_indices, DataType *x_values,
+                         IndexType *y_dense_shape, IndexType *y_batch_pointers,
+                         IndexType *y_row_pointers, IndexType *y_col_indices, DataType *softmax,
+                         uint32_t device_id, cudaStream_t cuda_stream) {
   int threads_per_block = CUDA_THREADS(device_id);
   unsigned int grid_num = UP_DIV(row_pointers_size - 1, threads_per_block);
@@ -94,11 +94,11 @@ __global__ void SparseSegmentMeanKernel(const DataType *x_ptr, const IndexType *
 }

 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void SparseSegmentMean(const DataType *x_ptr, const IndexType *indices_ptr,
-                                       const IndexType *segment_ids_ptr, size_t *segment_pos_ptr, DataType *y_ptr,
-                                       size_t outer_size, size_t inner_size, size_t indices_size, size_t segment_size,
-                                       size_t x_size, size_t y_size, size_t batch_size, uint32_t device_id,
-                                       cudaStream_t cuda_stream) {
+void SparseSegmentMean(const DataType *x_ptr, const IndexType *indices_ptr,
+                       const IndexType *segment_ids_ptr, size_t *segment_pos_ptr, DataType *y_ptr,
+                       size_t outer_size, size_t inner_size, size_t indices_size, size_t segment_size,
+                       size_t x_size, size_t y_size, size_t batch_size, uint32_t device_id,
+                       cudaStream_t cuda_stream) {
   // Get start position of each segment and set to segment_pos_ptr.
   // The last element of segment_pos_ptr must equal to indices_size.
   SparseSegmentPosKernel<<<CUDA_BLOCKS(device_id, indices_size + 1), CUDA_THREADS(device_id), 0, cuda_stream>>>(