!40103 adapt cuda.ops code for msvc (del CUDA_LIB_EXPORT from template func)

Merge pull request !40103 from chenminmin/master
i-robot 2022-08-10 10:41:58 +00:00 committed by Gitee
commit 05ba391e4c
16 changed files with 63 additions and 62 deletions
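
Background, as a hedged sketch rather than a statement about the exact MindSpore sources: CUDA_LIB_EXPORT is the export/visibility macro of the cuda_ops library, which presumably expands to __declspec(dllexport) when the ops DLL is built with MSVC. MSVC does not accept such an export specifier on a function template definition, so the template definitions in the .cu files below drop the macro, while the explicit template instantiations, which are where exported symbols are actually emitted, are left untouched by this diff. In the sketch, the FillOp/FillKernel names and the macro definition are illustrative assumptions, not code from this commit.

#include <cuda_runtime.h>

// Assumed mapping of the export macro; the real definition lives in the cuda_ops headers.
#ifdef _MSC_VER
#define CUDA_LIB_EXPORT __declspec(dllexport)
#else
#define CUDA_LIB_EXPORT __attribute__((visibility("default")))
#endif

// Hypothetical elementwise op, used only to illustrate the pattern.
template <typename T>
__global__ void FillKernel(const size_t size, const T value, T *output) {
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
    output[pos] = value;
  }
}

// Template definition: no CUDA_LIB_EXPORT here (this is what the commit removes).
// No symbol is emitted for an uninstantiated template, and MSVC (unlike GCC)
// does not accept the export specifier on the definition, which is what the change works around.
template <typename T>
void FillOp(const size_t size, const T value, T *output, cudaStream_t cuda_stream) {
  if (size == 0) {
    return;
  }
  const unsigned int threads = 256;
  const unsigned int blocks = static_cast<unsigned int>((size + threads - 1) / threads);
  FillKernel<<<blocks, threads, 0, cuda_stream>>>(size, value, output);
}

// Explicit instantiation: an exported symbol is emitted here, so this is where an
// export macro is meaningful (assumed pattern; these lines are not part of the diff).
template CUDA_LIB_EXPORT void FillOp<float>(const size_t size, const float value, float *output,
                                            cudaStream_t cuda_stream);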

@@ -30,10 +30,10 @@ __global__ void AdaptiveMaxPool2DGradKernel(const T *input_data, const S *max_in
 }
 template <typename T, typename S>
-CUDA_LIB_EXPORT void CalAdaptiveMaxPool2DGrad(const T *input_data, const S *max_index, const int n, const int c,
-const uint input_height, const uint input_width,
-const uint output_height, const uint output_width, T *output_data,
-const uint32_t &device_id, cudaStream_t cuda_stream) {
+void CalAdaptiveMaxPool2DGrad(const T *input_data, const S *max_index, const int n, const int c,
+const uint input_height, const uint input_width,
+const uint output_height, const uint output_width, T *output_data,
+const uint32_t &device_id, cudaStream_t cuda_stream) {
 const int input_hw = input_height * input_width;
 const int input_chw = c * input_hw;
 const int input_nchw = n * input_chw;

@@ -194,14 +194,14 @@ __global__ void ComputeInferenceGradsKernel(const T *dy, const T *x, const float
 }
 template <typename T>
-CUDA_LIB_EXPORT void BatchNormGradGradTraining(const T *dy, const T *x, const float *scale, const float *mean,
-const float *variance, const T *dout_dx, const float *dout_dscale,
-const float *dout_dbias, T *ddy, T *dx, float *dscale, float *inv_std,
-float *x_hat, float *mean_dy, float *mean_dout_dx,
-float *mean_dy_mul_x_hat, float *mean_dout_dx_mul_x_hat,
-float *mean_dy_mul_dout_dx, const ShapeInfo &shape_info,
-DataFormat format, float epsilon, uint32_t device_id,
-cudaStream_t stream) {
+void BatchNormGradGradTraining(const T *dy, const T *x, const float *scale, const float *mean,
+const float *variance, const T *dout_dx, const float *dout_dscale,
+const float *dout_dbias, T *ddy, T *dx, float *dscale, float *inv_std,
+float *x_hat, float *mean_dy, float *mean_dout_dx,
+float *mean_dy_mul_x_hat, float *mean_dout_dx_mul_x_hat,
+float *mean_dy_mul_dout_dx, const ShapeInfo &shape_info,
+DataFormat format, float epsilon, uint32_t device_id,
+cudaStream_t stream) {
 size_t size = shape_info.n * shape_info.c * shape_info.h * shape_info.w;
 ComputeInvStdKernel<<<CUDA_BLOCKS(device_id, shape_info.c), CUDA_THREADS(device_id), 0, stream>>>(
 variance, shape_info, epsilon, inv_std);
@@ -220,11 +220,11 @@ CUDA_LIB_EXPORT void BatchNormGradGradTraining(const T *dy, const T *x, const fl
 }
 template <typename T>
-CUDA_LIB_EXPORT void BatchNormGradGradInference(const T *dy, const T *x, const float *scale, const float *mean,
-const float *variance, const T *dout_dx, const float *dout_dscale,
-const float *dout_dbias, T *ddy, T *dx, float *dscale, float *inv_std,
-float *tmp, const ShapeInfo &shape_info, DataFormat format,
-float epsilon, uint32_t device_id, cudaStream_t stream) {
+void BatchNormGradGradInference(const T *dy, const T *x, const float *scale, const float *mean,
+const float *variance, const T *dout_dx, const float *dout_dscale,
+const float *dout_dbias, T *ddy, T *dx, float *dscale, float *inv_std,
+float *tmp, const ShapeInfo &shape_info, DataFormat format,
+float epsilon, uint32_t device_id, cudaStream_t stream) {
 size_t size = shape_info.n * shape_info.c * shape_info.h * shape_info.w;
 ComputeInvStdKernel<<<CUDA_BLOCKS(device_id, shape_info.c), CUDA_THREADS(device_id), 0, stream>>>(
 variance, shape_info, epsilon, inv_std);

@@ -19,6 +19,8 @@
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
 #include "include/cuda_fp16.h"
+constexpr float kFloatEplison = 1e-37;
 // Basic function
 template <typename T>
 struct GreaterFunc {

@@ -20,8 +20,6 @@
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
-const float kFloatEplison = 1e-37;
 enum BroadcastOpType {
 BROADCAST_TYPE_GREATER = 0,
 BROADCAST_TYPE_LESS = 1,

@@ -15,7 +15,9 @@
 */
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
+#ifndef _MSC_VER
 #include <pthread.h>
+#endif
 #include <unordered_map>
 namespace mindspore {

@@ -298,9 +298,9 @@ void KernelHelper(BinaryFunctor functor, ValueType init, const ValueType *input_
 }
 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void CumMinMax(CumOpType cum_op_type, const DataType *input_ptr, DataType *value_ptr,
-IndexType *index_ptr, size_t outer_size_st, size_t axis_size_st, size_t inner_size_st,
-const uint32_t &device_id, cudaStream_t cuda_stream) {
+void CumMinMax(CumOpType cum_op_type, const DataType *input_ptr, DataType *value_ptr,
+IndexType *index_ptr, size_t outer_size_st, size_t axis_size_st, size_t inner_size_st,
+const uint32_t &device_id, cudaStream_t cuda_stream) {
 switch (cum_op_type) {
 case CUMMIN: {
 KernelHelper(BinaryFunctor<thrust::less_equal<DataType>, DataType>{}, NumericMax<DataType>(), input_ptr,

@@ -24,7 +24,6 @@ using Complex = mindspore::utils::Complex<T>;
 __inline__ __device__ int PermutationOrder(int m, const int *per_batch_pivot) {
 int permutation_order = 0;
 for (int i = 0; i < m - 1; ++i) {
-// Refer to http://icl.cs.utk.edu/lapack-forum/viewtopic.php?f=2&t=340 .
 permutation_order += per_batch_pivot[i] != (i + 1);
 }
 return permutation_order;
@@ -81,9 +80,9 @@ __global__ void CalculateDeterminantByLuKernel(const T *lu_input, const int *piv
 }
 template <typename T>
-CUDA_LIB_EXPORT void CalculateDeterminantByLu(const T *lu_input, const int *pivot, int m, int batch_size,
-bool is_sign_log_determinant, T *determinant_output, T *sign_output,
-const uint32_t &device_id, cudaStream_t cuda_stream) {
+void CalculateDeterminantByLu(const T *lu_input, const int *pivot, int m, int batch_size,
+bool is_sign_log_determinant, T *determinant_output, T *sign_output,
+const uint32_t &device_id, cudaStream_t cuda_stream) {
 // Parallelization by batch_size.
 CalculateDeterminantByLuKernel<<<CUDA_BLOCKS(device_id, batch_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
 lu_input, pivot, m, batch_size, is_sign_log_determinant, determinant_output, sign_output);

@@ -22,8 +22,8 @@
 #include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
 template <typename T>
-CUDA_LIB_EXPORT void HistogramFixedWidthKernel(int num_samples, const T *d_samples, const double *d_levels,
-int32_t *d_histogram, int64_t num_levels, cudaStream_t cuda_stream) {
+void HistogramFixedWidthKernel(int num_samples, const T *d_samples, const double *d_levels,
+int32_t *d_histogram, int64_t num_levels, cudaStream_t cuda_stream) {
 void *d_temp_storage = nullptr;
 size_t temp_storage_bytes = 0;
 (void)cub::DeviceHistogram::HistogramRange(nullptr, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,

@@ -89,8 +89,8 @@ void ApplyLambEraly(const size_t size, T *variable, T *m, T *v, const float *bet
 }
 template <typename T>
-CUDA_LIB_EXPORT void ApplyLambLater(const size_t size, T *variable, const float *lr, const float *update,
-const float *trust_ratio, cudaStream_t cuda_stream) {
+void ApplyLambLater(const size_t size, T *variable, const float *lr, const float *update,
+const float *trust_ratio, cudaStream_t cuda_stream) {
 ApplyLambAfterNormKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, variable, lr, update, trust_ratio);
 }

@@ -67,10 +67,10 @@ __global__ void MaskedSelectKernel(const T *input_ptr, const size_t *index_ptr,
 // the i is input shape, the j is mask shape, the o is broadcast shape
 template <typename T>
-CUDA_LIB_EXPORT void MaskedSelect(const T *input_ptr, const bool *mask_ptr, size_t *index_ptr,
-const std::vector<size_t> i, const std::vector<size_t> j, const std::vector<size_t> o,
-T *input_broadcast_ptr, bool *mask_broadcast_ptr,
-T *output_ptr, cudaStream_t cuda_stream) {
+void MaskedSelect(const T *input_ptr, const bool *mask_ptr, size_t *index_ptr,
+const std::vector<size_t> i, const std::vector<size_t> j, const std::vector<size_t> o,
+T *input_broadcast_ptr, bool *mask_broadcast_ptr,
+T *output_ptr, cudaStream_t cuda_stream) {
 size_t broadcast_size = o[0] * o[1] * o[2] * o[3] * o[4] * o[5] * o[6];
 const T *last_input = nullptr;
 const bool *last_mask = nullptr;

@@ -25,8 +25,8 @@ __global__ void NextAfterKernel(const size_t size, const T *input1, const T *inp
 }
 template <typename T>
-CUDA_LIB_EXPORT void NextAfter(const size_t size, const T *input1, const T *input2, T *output,
-const uint32_t &device_id, cudaStream_t cuda_stream) {
+void NextAfter(const size_t size, const T *input1, const T *input2, T *output,
+const uint32_t &device_id, cudaStream_t cuda_stream) {
 NextAfterKernel<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream
 >>>(size, input1, input2, output);
 return;

@@ -76,9 +76,9 @@ __global__ void NonZeroKernel(IndexType *output_ptr, const size_t *output_size_p
 }
 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void NonZero(const DataType *input_ptr, IndexType *output_ptr, size_t *output_size_ptr,
-const std::vector<size_t> &input_shape, size_t input_size, const uint32_t &device_id,
-cudaStream_t cuda_stream) {
+void NonZero(const DataType *input_ptr, IndexType *output_ptr, size_t *output_size_ptr,
+const std::vector<size_t> &input_shape, size_t input_size, const uint32_t &device_id,
+cudaStream_t cuda_stream) {
 // Set the index (1-D base) for non-zero elements and place them into output.
 // To support in place operation later, we use custom output iterator,
 // which is inspired by cub library. And output_size_ptr stores the number of non-zero elements.

@@ -85,10 +85,10 @@ __global__ void ReverseSequence(const size_t size, const T *input, const S *seq_
 }
 template <typename T, typename S>
-CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
-const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
-size_t *input_shape_cum_ptr, size_t shape_size, T *output,
-cudaStream_t cuda_stream) {
+void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim,
+const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr,
+size_t *input_shape_cum_ptr, size_t shape_size, T *output,
+cudaStream_t cuda_stream) {
 ComputeCumShape<<<1, 1, 0, cuda_stream>>>(input_shape_ptr, input_shape_cum_ptr, shape_size);
 ReverseSequence<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
 size, input, seq_len, batch_dim, seq_dim, cur_pos_arr, input_shape_ptr, input_shape_cum_ptr, shape_size, output);

@@ -212,14 +212,14 @@ __global__ void SparseAddKernel(const T *a_indices, const S *a_values, const T *
 }
 template <typename T, typename S, typename K>
-CUDA_LIB_EXPORT void SparseAdd(const T *a_indices, const S *a_values, const T *b_indices, const S *b_values,
-T *sum_indices, S *sum_values,
-size_t *a_value_index, size_t *b_value_index, bool *is_from_a, S* whole_values,
-size_t *place_holder_index, int64_t *indices, bool *threshold_valid,
-const size_t a_indices_num, const size_t b_indices_num,
-S *res_store_mem, int64_t *sum_count,
-const K *threshold, const uint32_t &device_id,
-cudaStream_t cuda_stream) {
+void SparseAdd(const T *a_indices, const S *a_values, const T *b_indices, const S *b_values,
+T *sum_indices, S *sum_values,
+size_t *a_value_index, size_t *b_value_index, bool *is_from_a, S* whole_values,
+size_t *place_holder_index, int64_t *indices, bool *threshold_valid,
+const size_t a_indices_num, const size_t b_indices_num,
+S *res_store_mem, int64_t *sum_count,
+const K *threshold, const uint32_t &device_id,
+cudaStream_t cuda_stream) {
 SparseAddKernel<<<GET_BLOCKS(1), 1, 0, cuda_stream>>>(
 a_indices, a_values, b_indices, b_values,
 sum_indices, sum_values,

@@ -54,12 +54,12 @@ __global__ void GpuCopy(IndexType *d_in, IndexType *d_out, int N) {
 }
 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void SparseMatrixSoftmax(int shape_size, int batch_pointers_size, int row_pointers_size,
-int col_indices_size, IndexType *x_dense_shape, IndexType *x_batch_pointers,
-IndexType *x_row_pointers, IndexType *x_col_indices, DataType *x_values,
-IndexType *y_dense_shape, IndexType *y_batch_pointers,
-IndexType *y_row_pointers, IndexType *y_col_indices, DataType *softmax,
-uint32_t device_id, cudaStream_t cuda_stream) {
+void SparseMatrixSoftmax(int shape_size, int batch_pointers_size, int row_pointers_size,
+int col_indices_size, IndexType *x_dense_shape, IndexType *x_batch_pointers,
+IndexType *x_row_pointers, IndexType *x_col_indices, DataType *x_values,
+IndexType *y_dense_shape, IndexType *y_batch_pointers,
+IndexType *y_row_pointers, IndexType *y_col_indices, DataType *softmax,
+uint32_t device_id, cudaStream_t cuda_stream) {
 int threads_per_block = CUDA_THREADS(device_id);
 unsigned int grid_num = UP_DIV(row_pointers_size - 1, threads_per_block);

@@ -94,11 +94,11 @@ __global__ void SparseSegmentMeanKernel(const DataType *x_ptr, const IndexType *
 }
 template <typename DataType, typename IndexType>
-CUDA_LIB_EXPORT void SparseSegmentMean(const DataType *x_ptr, const IndexType *indices_ptr,
-const IndexType *segment_ids_ptr, size_t *segment_pos_ptr, DataType *y_ptr,
-size_t outer_size, size_t inner_size, size_t indices_size, size_t segment_size,
-size_t x_size, size_t y_size, size_t batch_size, uint32_t device_id,
-cudaStream_t cuda_stream) {
+void SparseSegmentMean(const DataType *x_ptr, const IndexType *indices_ptr,
+const IndexType *segment_ids_ptr, size_t *segment_pos_ptr, DataType *y_ptr,
+size_t outer_size, size_t inner_size, size_t indices_size, size_t segment_size,
+size_t x_size, size_t y_size, size_t batch_size, uint32_t device_id,
+cudaStream_t cuda_stream) {
 // Get start position of each segment and set to segment_pos_ptr.
 // The last element of segment_pos_ptr must equal to indices_size.
 SparseSegmentPosKernel<<<CUDA_BLOCKS(device_id, indices_size + 1), CUDA_THREADS(device_id), 0, cuda_stream>>>(