@@ -18,7 +18,7 @@
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
 
 template <typename T, typename S>
-__global__ void TensorScatterUpdateKernel(const T *input, const S *indices, const T *update, T *output,
+__global__ void TensorScatterUpdateKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                           const size_t block_size, const size_t input_size, const size_t output_size,
                                           const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                           S *work_shape) {
@@ -41,12 +41,14 @@ __global__ void TensorScatterUpdateKernel(const T *input, const S *indices, cons
     out_bound |= write_index >= output_size;
     if (!out_bound) {
       output[write_index] = update[read_index];
+    } else {
+      has_error[0] = true;
     }
   }
 }
 
 template <typename T, typename S>
-__global__ void TensorScatterMinKernel(const T *input, const S *indices, const T *update, T *output,
+__global__ void TensorScatterMinKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                        const size_t block_size, const size_t input_size, const size_t output_size,
                                        const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                        S *work_shape) {
@@ -69,12 +71,14 @@ __global__ void TensorScatterMinKernel(const T *input, const S *indices, const T
     out_bound |= write_index >= output_size;
     if (!out_bound) {
       (void)MsAtomicMin(&output[write_index], update[read_index]);
+    } else {
+      has_error[0] = true;
     }
   }
 }
 
 template <typename T, typename S>
-__global__ void TensorScatterMaxKernel(const T *input, const S *indices, const T *update, T *output,
+__global__ void TensorScatterMaxKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                        const size_t block_size, const size_t input_size, const size_t output_size,
                                        const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                        S *work_shape) {
@@ -97,12 +101,14 @@ __global__ void TensorScatterMaxKernel(const T *input, const S *indices, const T
     out_bound |= write_index >= output_size;
     if (!out_bound) {
       (void)MsAtomicMax(&output[write_index], update[read_index]);
+    } else {
+      has_error[0] = true;
     }
   }
 }
 
 template <typename T, typename S>
-__global__ void TensorScatterAddKernel(const T *input, const S *indices, const T *update, T *output,
+__global__ void TensorScatterAddKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                        const size_t block_size, const size_t input_size, const size_t output_size,
                                        const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                        S *work_shape) {
@@ -125,12 +131,14 @@ __global__ void TensorScatterAddKernel(const T *input, const S *indices, const T
     out_bound |= write_index >= output_size;
     if (!out_bound) {
       (void)MsAtomicAdd(&output[write_index], update[read_index]);
+    } else {
+      has_error[0] = true;
     }
   }
 }
 
 template <typename T, typename S>
-__global__ void TensorScatterSubKernel(const T *input, const S *indices, const T *update, T *output,
+__global__ void TensorScatterSubKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                        const size_t block_size, const size_t input_size, const size_t output_size,
                                        const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                        S *work_shape) {
@@ -153,12 +161,14 @@ __global__ void TensorScatterSubKernel(const T *input, const S *indices, const T
     out_bound |= write_index >= output_size;
     if (!out_bound) {
       (void)MsAtomicSub(&output[write_index], update[read_index]);
+    } else {
+      has_error[0] = true;
     }
   }
 }
 
 template <typename T, typename S>
-__global__ void TensorScatterMulKernel(const T *input, const S *indices, const T *update, T *output,
+__global__ void TensorScatterMulKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                        const size_t block_size, const size_t input_size, const size_t output_size,
                                        const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                        S *work_shape) {
@@ -181,12 +191,14 @@ __global__ void TensorScatterMulKernel(const T *input, const S *indices, const T
     out_bound |= write_index >= output_size;
     if (!out_bound) {
       (void)MsAtomicMul(&output[write_index], update[read_index]);
+    } else {
+      has_error[0] = true;
     }
   }
 }
 
 template <typename T, typename S>
-__global__ void TensorScatterDivKernel(const T *input, const S *indices, const T *update, T *output,
+__global__ void TensorScatterDivKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                        const size_t block_size, const size_t input_size, const size_t output_size,
                                        const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                        S *work_shape) {
@@ -209,44 +221,46 @@ __global__ void TensorScatterDivKernel(const T *input, const S *indices, const T
     out_bound |= write_index >= output_size;
     if (!out_bound) {
       (void)MsAtomicDiv(&output[write_index], update[read_index]);
+    } else {
+      has_error[0] = true;
     }
   }
 }
 
 template <typename T, typename S>
 void TensorScatterArithmetic(const enum TensorScatterArithmeticFunctionType &func_type, const T *input,
-                             const S *indices, const T *update, T *output, const size_t &block_size,
+                             const S *indices, const T *update, T *output, int *has_error, const size_t &block_size,
                              const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
                              const size_t &indices_dim_1, S *indices_stride, S *work_shape, uint32_t device_id,
                              cudaStream_t stream) {
   switch (func_type) {
     case TENSOR_SCATTER_FUNC_UPDATE:
       return TensorScatterUpdateKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
-        input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
+        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
         indices_stride, work_shape);
     case TENSOR_SCATTER_FUNC_MIN:
       return TensorScatterMinKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
-        input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
+        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
         indices_stride, work_shape);
     case TENSOR_SCATTER_FUNC_MAX:
       return TensorScatterMaxKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
-        input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
+        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
         indices_stride, work_shape);
     case TENSOR_SCATTER_FUNC_ADD:
       return TensorScatterAddKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
-        input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
+        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
         indices_stride, work_shape);
     case TENSOR_SCATTER_FUNC_SUB:
       return TensorScatterSubKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
-        input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
+        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
         indices_stride, work_shape);
     case TENSOR_SCATTER_FUNC_MUL:
       return TensorScatterMulKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
-        input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
+        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
         indices_stride, work_shape);
     case TENSOR_SCATTER_FUNC_DIV:
       return TensorScatterDivKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
-        input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
+        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
         indices_stride, work_shape);
     default:
       break;
@@ -254,175 +268,178 @@ void TensorScatterArithmetic(const enum TensorScatterArithmeticFunctionType &fun
 }
 
 template <typename T, typename S>
-void CallTensorScatterUpdate(const T *input, const S *indices, const T *update, T *output, const size_t &block_size,
-                             const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-                             const size_t &indices_dim_1, S *indices_stride, S *work_shape, uint32_t device_id,
-                             cudaStream_t stream) {
+void CallTensorScatterUpdate(const T *input, const S *indices, const T *update, T *output, int *has_error,
+                             const size_t &block_size, const size_t &input_size, const size_t &output_size,
+                             const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride, S *work_shape,
+                             uint32_t device_id, cudaStream_t stream) {
   TensorScatterUpdateKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
-    input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride,
-    work_shape);
+    input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
+    indices_stride, work_shape);
 }
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<half, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const half *input, const int *indices, const half *update,
-  half *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
+  half *output, int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
   const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
   cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<float, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const float *input, const int *indices,
-  const float *update, float *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
-  cudaStream_t stream);
+  const float *update, float *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
+  int *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<double, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const double *input, const int *indices,
-  const double *update, double *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
-  cudaStream_t stream);
+  const double *update, double *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
+  int *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<char, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const char *input, const int *indices, const char *update,
-  char *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
+  char *output, int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
   const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
   cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<unsigned char, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const unsigned char *input, const int *indices,
-  const unsigned char *update, unsigned char *output, const size_t &block_size, const size_t &input_size,
-  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
-  int *work_shape, uint32_t device_id, cudaStream_t stream);
+  const unsigned char *update, unsigned char *output, int *has_error, const size_t &block_size,
+  const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
+  int *indices_stride, int *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<int16_t, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const int16_t *input, const int *indices,
-  const int16_t *update, int16_t *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
-  cudaStream_t stream);
+  const int16_t *update, int16_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
+  int *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint16_t, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const uint16_t *input, const int *indices,
-  const uint16_t *update, uint16_t *output, const size_t &block_size, const size_t &input_size,
+  const uint16_t *update, uint16_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
   const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
   int *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<int, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const int *input, const int *indices, const int *update,
-  int *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
+  int *output, int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
   const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
   cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint32_t, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const uint32_t *input, const int *indices,
-  const uint32_t *update, uint32_t *output, const size_t &block_size, const size_t &input_size,
+  const uint32_t *update, uint32_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
   const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
   int *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<int64_t, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const int64_t *input, const int *indices,
-  const int64_t *update, int64_t *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
-  cudaStream_t stream);
+  const int64_t *update, int64_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
+  int *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint64_t, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const uint64_t *input, const int *indices,
-  const uint64_t *update, uint64_t *output, const size_t &block_size, const size_t &input_size,
+  const uint64_t *update, uint64_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
   const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
   int *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<bool, int>(
   const enum TensorScatterArithmeticFunctionType &func_type, const bool *input, const int *indices, const bool *update,
-  bool *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
+  bool *output, int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
   const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
   cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<half, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const half *input, const int64_t *indices,
-  const half *update, half *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-  uint32_t device_id, cudaStream_t stream);
+  const half *update, half *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
+  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<float, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const float *input, const int64_t *indices,
-  const float *update, float *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-  uint32_t device_id, cudaStream_t stream);
+  const float *update, float *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
+  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<double, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const double *input, const int64_t *indices,
-  const double *update, double *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-  uint32_t device_id, cudaStream_t stream);
+  const double *update, double *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
+  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<char, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const char *input, const int64_t *indices,
-  const char *update, char *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-  uint32_t device_id, cudaStream_t stream);
+  const char *update, char *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
+  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<unsigned char, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const unsigned char *input, const int64_t *indices,
-  const unsigned char *update, unsigned char *output, const size_t &block_size, const size_t &input_size,
-  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
-  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
+  const unsigned char *update, unsigned char *output, int *has_error, const size_t &block_size,
+  const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
+  int64_t *indices_stride, int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<int16_t, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const int16_t *input, const int64_t *indices,
-  const int16_t *update, int16_t *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-  uint32_t device_id, cudaStream_t stream);
+  const int16_t *update, int16_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
+  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint16_t, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const uint16_t *input, const int64_t *indices,
-  const uint16_t *update, uint16_t *output, const size_t &block_size, const size_t &input_size,
+  const uint16_t *update, uint16_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
   const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
   int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<int, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const int *input, const int64_t *indices,
-  const int *update, int *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-  uint32_t device_id, cudaStream_t stream);
+  const int *update, int *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
+  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint32_t, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const uint32_t *input, const int64_t *indices,
-  const uint32_t *update, uint32_t *output, const size_t &block_size, const size_t &input_size,
+  const uint32_t *update, uint32_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
   const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
   int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<int64_t, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const int64_t *input, const int64_t *indices,
-  const int64_t *update, int64_t *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-  uint32_t device_id, cudaStream_t stream);
+  const int64_t *update, int64_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
+  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint64_t, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const uint64_t *input, const int64_t *indices,
-  const uint64_t *update, uint64_t *output, const size_t &block_size, const size_t &input_size,
+  const uint64_t *update, uint64_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
   const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
   int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void TensorScatterArithmetic<bool, int64_t>(
   const enum TensorScatterArithmeticFunctionType &func_type, const bool *input, const int64_t *indices,
-  const bool *update, bool *output, const size_t &block_size, const size_t &input_size, const size_t &output_size,
-  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
-  uint32_t device_id, cudaStream_t stream);
+  const bool *update, bool *output, int *has_error, const size_t &block_size, const size_t &input_size,
+  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
+  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void CallTensorScatterUpdate<Complex<float>, int64_t>(
   const Complex<float> *input, const int64_t *indices, const Complex<float> *update, Complex<float> *output,
-  const size_t &block_size, const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-  const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
+  int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
+  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
+  uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void CallTensorScatterUpdate<Complex<float>, int>(
-  const Complex<float> *input, const int *indices, const Complex<float> *update, Complex<float> *output,
+  const Complex<float> *input, const int *indices, const Complex<float> *update, Complex<float> *output, int *has_error,
   const size_t &block_size, const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
   const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void CallTensorScatterUpdate<Complex<double>, int64_t>(
   const Complex<double> *input, const int64_t *indices, const Complex<double> *update, Complex<double> *output,
-  const size_t &block_size, const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-  const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
+  int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
+  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
+  uint32_t device_id, cudaStream_t stream);
 
 template CUDA_LIB_EXPORT void CallTensorScatterUpdate<Complex<double>, int>(
   const Complex<double> *input, const int *indices, const Complex<double> *update, Complex<double> *output,
-  const size_t &block_size, const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
-  const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id, cudaStream_t stream);
+  int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
+  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
+  cudaStream_t stream);
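A note on the change, with a hedged sketch of how the new flag might be consumed: every kernel above now takes a device pointer `int *has_error` and sets `has_error[0] = true;` whenever a scatter index resolves out of bounds, instead of silently skipping the element. The flag is only meaningful after the kernel has finished, so a caller has to copy it back and synchronize before raising an error. The sketch below is a minimal illustration of that pattern, not the MindSpore kernel-mod code (which this diff does not show); the wrapper name and error message are invented for the example.

// Hypothetical caller-side sketch: allocate and clear the device flag, launch
// through the updated entry point, then read the flag back once the kernel is done.
#include <cuda_runtime.h>
#include <stdexcept>

void LaunchScatterWithBoundsCheck(cudaStream_t stream /* , kernel arguments elided */) {
  int *has_error = nullptr;
  cudaMalloc(&has_error, sizeof(int));
  cudaMemsetAsync(has_error, 0, sizeof(int), stream);  // clear before launch

  // TensorScatterArithmetic(func_type, input, indices, update, output, has_error,
  //                         block_size, input_size, output_size, indices_dim_0,
  //                         indices_dim_1, indices_stride, work_shape, device_id, stream);

  int host_flag = 0;
  cudaMemcpyAsync(&host_flag, has_error, sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // flag is valid only after the kernel completes
  cudaFree(has_error);
  if (host_flag != 0) {
    throw std::runtime_error("TensorScatter: index out of bounds");
  }
}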