!48621 update tensor_scatter_XXX indices error detection for faster runtime & update docs

Merge pull request !48621 from Yanzhi_YI/tensor_scatter_min
i-robot 2023-02-13 01:35:56 +00:00 committed by Gitee
commit bc68e19e11
14 changed files with 169 additions and 107 deletions

View File

@@ -20,4 +20,5 @@
Raises:
- **TypeError** - The data type of `indices` is neither int32 nor int64.
- **ValueError** - The length of the shape of `input_x` is less than the last dimension of the shape of `indices`.
- **ValueError** - The value of `input_x` does not match the input `indices`.
- **RuntimeError** - `indices` is out of the index range of `input_x`.

View File

@@ -21,3 +21,4 @@
Raises:
- **TypeError** - The data type of `indices` is neither int32 nor int64.
- **ValueError** - The length of the shape of `input_x` is less than the last dimension of the shape of `indices`.
- **RuntimeError** - `indices` is out of the index range of `input_x`.

View File

@@ -22,3 +22,4 @@ mindspore.ops.tensor_scatter_div
Raises:
- **TypeError** - The data type of `indices` is neither int32 nor int64.
- **ValueError** - The length of the shape of `input_x` is less than the last dimension of the shape of `indices`.
- **RuntimeError** - `indices` is out of the index range of `input_x`.

View File

@@ -21,3 +21,4 @@
Raises:
- **TypeError** - The data type of `indices` is neither int32 nor int64.
- **ValueError** - The length of the shape of `input_x` is less than the last dimension of the shape of `indices`.
- **RuntimeError** - `indices` is out of the index range of `input_x`.

View File

@@ -21,3 +21,4 @@
Raises:
- **TypeError** - The data type of `indices` is neither int32 nor int64.
- **ValueError** - The length of the shape of `input_x` is less than the last dimension of the shape of `indices`.
- **RuntimeError** - `indices` is out of the index range of `input_x`.

View File

@@ -21,3 +21,4 @@ mindspore.ops.tensor_scatter_mul
Raises:
- **TypeError** - The data type of `indices` is neither int32 nor int64.
- **ValueError** - The length of the shape of `input_x` is less than the last dimension of the shape of `indices`.
- **RuntimeError** - `indices` is out of the index range of `input_x`.

View File

@@ -21,3 +21,4 @@
Raises:
- **TypeError** - The data type of `indices` is neither int32 nor int64.
- **ValueError** - The length of the shape of `input_x` is less than the last dimension of the shape of `indices`.
- **RuntimeError** - `indices` is out of the index range of `input_x`.
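Concretely, the new RuntimeError fires when a row of `indices` names a coordinate outside the shape of `input_x`. As a rough host-side sketch of the condition being checked (an illustrative helper, not MindSpore code):

#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: an index row {i_0, ..., i_k} is in range iff 0 <= i_j < shape[j] for every j.
bool IndexRowInRange(const std::vector<int64_t> &row, const std::vector<size_t> &shape) {
  for (size_t j = 0; j < row.size() && j < shape.size(); ++j) {
    if (row[j] < 0 || static_cast<size_t>(row[j]) >= shape[j]) {
      return false;  // out of the index range of input_x -> RuntimeError
    }
  }
  return true;
}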

View File

@@ -190,7 +190,19 @@ template <typename T>
using Complex = mindspore::utils::Complex<T>;
template <typename S>
void TensorScatterArithmeticGpuKernelMod::CheckIndicesValid(int *has_error, S *indices) {
  // detect errors
  int *has_error_host = reinterpret_cast<int *>(malloc(sizeof(int)));
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
    cudaMemcpyAsync(has_error_host, has_error, sizeof(int), cudaMemcpyDeviceToHost,
                    reinterpret_cast<cudaStream_t>(stream_ptr_)),
    "TensorScatterArithmeticGpuKernelMod cudaMemcpy failed in TensorScatterArithmeticGpuKernelMod::CheckIndicesValid.");
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr_)),
                                     "cudaStreamSynchronized failed");
  if (has_error_host[0] != 1) {
    return;
  }
  size_t total_indices_num =
    std::accumulate(indices_shape_.begin(), indices_shape_.end(), 1, std::multiplies<size_t>());
  size_t total_indices_bytes = total_indices_num * indices_unit_size_;
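Two details in the hunk above matter: the device-to-host copy of the flag is asynchronous, so it must be followed by a stream synchronize before the host buffer is read, and the expensive host-side walk over `indices` now runs only when the flag was actually set. A minimal sketch of that readback step, with illustrative names rather than the kernel mod's real helpers:

#include <cuda_runtime.h>

// Sketch: read a device-side error flag back on the host.
bool DeviceFlagIsSet(const int *d_flag, cudaStream_t stream) {
  int h_flag = 0;
  // Enqueue the copy on the same stream the kernel ran on...
  cudaMemcpyAsync(&h_flag, d_flag, sizeof(int), cudaMemcpyDeviceToHost, stream);
  // ...then block until the kernel and the copy have both finished;
  // without this sync, h_flag could be read before the copy lands.
  cudaStreamSynchronize(stream);
  return h_flag != 0;
}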
@@ -242,7 +254,16 @@ bool TensorScatterArithmeticGpuKernelMod::LaunchKernel(const std::vector<Address
  T *update = GetDeviceAddress<T>(inputs, kIndex2);
  T *output = GetDeviceAddress<T>(outputs, kIndex0);
  // set a flag to detect errors
  int has_error_host[1] = {0};
  int *has_error = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&has_error), sizeof(int));
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
    cudaMemcpyAsync(has_error, has_error_host, sizeof(int), cudaMemcpyHostToDevice,
                    reinterpret_cast<cudaStream_t>(stream_ptr_)),
    "TensorScatterArithmeticGpuKernelMod cudaMemcpy failed in TensorScatterArithmeticGpuKernelMod::LaunchKernel.");
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr_)),
                                     "cudaStreamSynchronized failed");
  if (!memcpy_flag_) {
    const size_t indices_len = indices_unit_size_ * vec_indices_stride_.size();
@@ -268,17 +289,20 @@ bool TensorScatterArithmeticGpuKernelMod::LaunchKernel(const std::vector<Address
  if constexpr ((std::is_same_v<T, Complex<float>>) || (std::is_same_v<T, Complex<double>>)) {
    if (kernel_name_ == kTensorScatterUpdate) {
      CallTensorScatterUpdate(input, indices, update, output, has_error, block_size_, update_size_, output_size_,
                              indices_dim_0_, indices_dim_1_, reinterpret_cast<S *>(indices_stride_),
                              reinterpret_cast<S *>(work_shape_), device_id_,
                              reinterpret_cast<cudaStream_t>(stream_ptr_));
      (void)CheckIndicesValid(has_error, indices);
      return true;
    }
  } else {
    TensorScatterArithmetic(op_func_type_, input, indices, update, output, has_error, block_size_, update_size_,
                            output_size_, indices_dim_0_, indices_dim_1_, reinterpret_cast<S *>(indices_stride_),
                            reinterpret_cast<S *>(work_shape_), device_id_,
                            reinterpret_cast<cudaStream_t>(stream_ptr_));
    (void)CheckIndicesValid(has_error, indices);
  }
  return true;
}
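Taken together with CheckIndicesValid above, LaunchKernel now follows a zero-the-flag / launch / read-back cycle. A condensed, self-contained sketch of that lifecycle, assuming simplified names and omitting the CHECK_CUDA_RET wrappers:

#include <cuda_runtime.h>

// Lifecycle sketch for the out-of-bounds flag (illustrative, not the real kernel mod).
void LaunchWithBoundsCheck(cudaStream_t stream) {
  int h_flag = 0;
  int *d_flag = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&d_flag), sizeof(int));
  // 1. Zero the flag on the device before any thread can set it.
  cudaMemcpyAsync(d_flag, &h_flag, sizeof(int), cudaMemcpyHostToDevice, stream);
  // 2. Launch the scatter kernel with d_flag as its has_error argument, e.g.
  //    ScatterKernel<<<grid, block, 0, stream>>>(..., d_flag);
  // 3. Copy the flag back and synchronize before inspecting it.
  cudaMemcpyAsync(&h_flag, d_flag, sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_flag);
  if (h_flag != 0) {
    // Only now pay for the host-side walk over indices that produces
    // a precise out-of-range error message (what CheckIndicesValid does).
  }
}

This is where the speedup in the PR title comes from: the per-element host validation of `indices` is no longer paid on every launch; it runs only after a kernel has actually reported an out-of-range index.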

View File

@@ -58,7 +58,7 @@ class TensorScatterArithmeticGpuKernelMod : public NativeGpuKernelMod,
  bool GetOpType(const BaseOperatorPtr &base_operator);
  void UpdateSize();
  template <typename S>
  void CheckIndicesValid(int *has_error, S *indices);
  template <typename T, typename S>
  bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                    const std::vector<AddressPtr> &outputs);

View File

@@ -18,7 +18,7 @@
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"
template <typename T, typename S>
__global__ void TensorScatterUpdateKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                          const size_t block_size, const size_t input_size, const size_t output_size,
                                          const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                          S *work_shape) {
@@ -41,12 +41,14 @@ __global__ void TensorScatterUpdateKernel(const T *input, const S *indices, cons
    out_bound |= write_index >= output_size;
    if (!out_bound) {
      output[write_index] = update[read_index];
    } else {
      has_error[0] = true;
    }
  }
}
template <typename T, typename S>
__global__ void TensorScatterMinKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                       const size_t block_size, const size_t input_size, const size_t output_size,
                                       const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                       S *work_shape) {
@@ -69,12 +71,14 @@ __global__ void TensorScatterMinKernel(const T *input, const S *indices, const T
    out_bound |= write_index >= output_size;
    if (!out_bound) {
      (void)MsAtomicMin(&output[write_index], update[read_index]);
    } else {
      has_error[0] = true;
    }
  }
}
template <typename T, typename S>
__global__ void TensorScatterMaxKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                       const size_t block_size, const size_t input_size, const size_t output_size,
                                       const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                       S *work_shape) {
@@ -97,12 +101,14 @@ __global__ void TensorScatterMaxKernel(const T *input, const S *indices, const T
    out_bound |= write_index >= output_size;
    if (!out_bound) {
      (void)MsAtomicMax(&output[write_index], update[read_index]);
    } else {
      has_error[0] = true;
    }
  }
}
template <typename T, typename S>
__global__ void TensorScatterAddKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                       const size_t block_size, const size_t input_size, const size_t output_size,
                                       const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                       S *work_shape) {
@@ -125,12 +131,14 @@ __global__ void TensorScatterAddKernel(const T *input, const S *indices, const T
    out_bound |= write_index >= output_size;
    if (!out_bound) {
      (void)MsAtomicAdd(&output[write_index], update[read_index]);
    } else {
      has_error[0] = true;
    }
  }
}
template <typename T, typename S>
__global__ void TensorScatterSubKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                       const size_t block_size, const size_t input_size, const size_t output_size,
                                       const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                       S *work_shape) {
@@ -153,12 +161,14 @@ __global__ void TensorScatterSubKernel(const T *input, const S *indices, const T
    out_bound |= write_index >= output_size;
    if (!out_bound) {
      (void)MsAtomicSub(&output[write_index], update[read_index]);
    } else {
      has_error[0] = true;
    }
  }
}
template <typename T, typename S>
__global__ void TensorScatterMulKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                       const size_t block_size, const size_t input_size, const size_t output_size,
                                       const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                       S *work_shape) {
@@ -181,12 +191,14 @@ __global__ void TensorScatterMulKernel(const T *input, const S *indices, const T
    out_bound |= write_index >= output_size;
    if (!out_bound) {
      (void)MsAtomicMul(&output[write_index], update[read_index]);
    } else {
      has_error[0] = true;
    }
  }
}
template <typename T, typename S>
__global__ void TensorScatterDivKernel(const T *input, const S *indices, const T *update, T *output, int *has_error,
                                       const size_t block_size, const size_t input_size, const size_t output_size,
                                       const size_t indices_dim_0, const size_t indices_dim_1, S *indices_stride,
                                       S *work_shape) {
@@ -209,44 +221,46 @@ __global__ void TensorScatterDivKernel(const T *input, const S *indices, const T
    out_bound |= write_index >= output_size;
    if (!out_bound) {
      (void)MsAtomicDiv(&output[write_index], update[read_index]);
    } else {
      has_error[0] = true;
    }
  }
}
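All seven kernels gain the same two-line `else` branch. Reduced to essentials, the per-thread pattern looks like this (a stripped-down sketch, not the actual MindSpore kernel):

// Sketch: a grid-stride scatter that records out-of-range writes in a flag.
template <typename T, typename S>
__global__ void ScatterWithFlag(const S *indices, const T *update, T *output,
                                size_t update_size, size_t output_size, int *has_error) {
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < update_size;
       pos += blockDim.x * gridDim.x) {
    const size_t write_index = static_cast<size_t>(indices[pos]);  // negative wraps huge, caught below
    if (write_index < output_size) {
      output[write_index] = update[pos];
    } else {
      has_error[0] = 1;  // plain store; all writers store the same value
    }
  }
}

A plain store is deliberately enough here: every offending thread writes the same value, so the race between writers is benign and no atomic is needed.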
template <typename T, typename S>
void TensorScatterArithmetic(const enum TensorScatterArithmeticFunctionType &func_type, const T *input,
                             const S *indices, const T *update, T *output, int *has_error, const size_t &block_size,
                             const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
                             const size_t &indices_dim_1, S *indices_stride, S *work_shape, uint32_t device_id,
                             cudaStream_t stream) {
  switch (func_type) {
    case TENSOR_SCATTER_FUNC_UPDATE:
      return TensorScatterUpdateKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
        indices_stride, work_shape);
    case TENSOR_SCATTER_FUNC_MIN:
      return TensorScatterMinKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
        indices_stride, work_shape);
    case TENSOR_SCATTER_FUNC_MAX:
      return TensorScatterMaxKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
        indices_stride, work_shape);
    case TENSOR_SCATTER_FUNC_ADD:
      return TensorScatterAddKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
        indices_stride, work_shape);
    case TENSOR_SCATTER_FUNC_SUB:
      return TensorScatterSubKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
        indices_stride, work_shape);
    case TENSOR_SCATTER_FUNC_MUL:
      return TensorScatterMulKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
        indices_stride, work_shape);
    case TENSOR_SCATTER_FUNC_DIV:
      return TensorScatterDivKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
        input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
        indices_stride, work_shape);
    default:
      break;
@@ -254,175 +268,178 @@ void TensorScatterArithmetic(const enum TensorScatterArithmeticFunctionType &fun
  }
}
template <typename T, typename S>
void CallTensorScatterUpdate(const T *input, const S *indices, const T *update, T *output, int *has_error,
                             const size_t &block_size, const size_t &input_size, const size_t &output_size,
                             const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride, S *work_shape,
                             uint32_t device_id, cudaStream_t stream) {
  TensorScatterUpdateKernel<<<CUDA_BLOCKS(device_id, output_size), CUDA_THREADS(device_id), 0, stream>>>(
    input, indices, update, output, has_error, block_size, input_size, output_size, indices_dim_0, indices_dim_1,
    indices_stride, work_shape);
}
template CUDA_LIB_EXPORT void TensorScatterArithmetic<half, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const half *input, const int *indices, const half *update,
  half *output, int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
  cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<float, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const float *input, const int *indices,
  const float *update, float *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
  int *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<double, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const double *input, const int *indices,
  const double *update, double *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
  int *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<char, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const char *input, const int *indices, const char *update,
  char *output, int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
  cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<unsigned char, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const unsigned char *input, const int *indices,
  const unsigned char *update, unsigned char *output, int *has_error, const size_t &block_size,
  const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
  int *indices_stride, int *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<int16_t, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const int16_t *input, const int *indices,
  const int16_t *update, int16_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
  int *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint16_t, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const uint16_t *input, const int *indices,
  const uint16_t *update, uint16_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
  int *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<int, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const int *input, const int *indices, const int *update,
  int *output, int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
  cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint32_t, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const uint32_t *input, const int *indices,
  const uint32_t *update, uint32_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
  int *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<int64_t, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const int64_t *input, const int *indices,
  const int64_t *update, int64_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
  int *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint64_t, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const uint64_t *input, const int *indices,
  const uint64_t *update, uint64_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride,
  int *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<bool, int>(
  const enum TensorScatterArithmeticFunctionType &func_type, const bool *input, const int *indices, const bool *update,
  bool *output, int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
  cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<half, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const half *input, const int64_t *indices,
  const half *update, half *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<float, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const float *input, const int64_t *indices,
  const float *update, float *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<double, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const double *input, const int64_t *indices,
  const double *update, double *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<char, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const char *input, const int64_t *indices,
  const char *update, char *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<unsigned char, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const unsigned char *input, const int64_t *indices,
  const unsigned char *update, unsigned char *output, int *has_error, const size_t &block_size,
  const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1,
  int64_t *indices_stride, int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<int16_t, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const int16_t *input, const int64_t *indices,
  const int16_t *update, int16_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint16_t, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const uint16_t *input, const int64_t *indices,
  const uint16_t *update, uint16_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<int, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const int *input, const int64_t *indices,
  const int *update, int *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint32_t, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const uint32_t *input, const int64_t *indices,
  const uint32_t *update, uint32_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<int64_t, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const int64_t *input, const int64_t *indices,
  const int64_t *update, int64_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<uint64_t, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const uint64_t *input, const int64_t *indices,
  const uint64_t *update, uint64_t *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void TensorScatterArithmetic<bool, int64_t>(
  const enum TensorScatterArithmeticFunctionType &func_type, const bool *input, const int64_t *indices,
  const bool *update, bool *output, int *has_error, const size_t &block_size, const size_t &input_size,
  const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride,
  int64_t *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void CallTensorScatterUpdate<Complex<float>, int64_t>(
  const Complex<float> *input, const int64_t *indices, const Complex<float> *update, Complex<float> *output,
  int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
  uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void CallTensorScatterUpdate<Complex<float>, int>(
  const Complex<float> *input, const int *indices, const Complex<float> *update, Complex<float> *output, int *has_error,
  const size_t &block_size, const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
  const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void CallTensorScatterUpdate<Complex<double>, int64_t>(
  const Complex<double> *input, const int64_t *indices, const Complex<double> *update, Complex<double> *output,
  int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
  const size_t &indices_dim_0, const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape,
  uint32_t device_id, cudaStream_t stream);
template CUDA_LIB_EXPORT void CallTensorScatterUpdate<Complex<double>, int>(
  const Complex<double> *input, const int *indices, const Complex<double> *update, Complex<double> *output,
  int *has_error, const size_t &block_size, const size_t &input_size, const size_t &output_size,
  const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, int *work_shape, uint32_t device_id,
  cudaStream_t stream);

View File

@@ -31,14 +31,15 @@ enum TensorScatterArithmeticFunctionType {
template <typename T, typename S>
CUDA_LIB_EXPORT void TensorScatterArithmetic(const enum TensorScatterArithmeticFunctionType &func_type, const T *input,
                                             const S *indices, const T *update, T *output, int *has_error,
                                             const size_t &block_size, const size_t &input_size,
                                             const size_t &output_size, const size_t &indices_dim_0,
                                             const size_t &indices_dim_1, S *indices_stride, S *work_shape,
                                             uint32_t device_id, cudaStream_t stream);
template <typename T, typename S>
CUDA_LIB_EXPORT void CallTensorScatterUpdate(const T *input, const S *indices, const T *update, T *output,
                                             int *has_error, const size_t &block_size, const size_t &input_size,
                                             const size_t &output_size, const size_t &indices_dim_0,
                                             const size_t &indices_dim_1, S *indices_stride, S *work_shape,
                                             uint32_t device_id, cudaStream_t stream);
#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_ARITHMETIC_CUH_

View File

@@ -101,12 +101,18 @@ int TensorScatterAddPlugin::RunCudaTensorScatterAdd(const nvinfer1::PluginTensor
  cudaMalloc(&input_shape_dptr, input_dims.nbDims * sizeof(int));
  cudaMemcpy(input_shape_dptr, input_dims.d, input_dims.nbDims * sizeof(int), cudaMemcpyHostToDevice);
  // a flag for error detect
  int flag_host[1] = {0};
  int *flag = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&flag), sizeof(int));
  cudaMemcpy(flag, flag_host, sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(outputs[0], inputs[0], input_num * sizeof(float), cudaMemcpyDeviceToDevice);
  TensorScatterArithmetic(TensorScatterArithmeticFunctionType::TENSOR_SCATTER_FUNC_ADD,
                          static_cast<const float *>(inputs[0]), static_cast<const int *>(inputs[1]),
                          static_cast<const float *>(inputs[INPUT_SIZE2]), static_cast<float *>(outputs[0]), flag,
                          block_size, update_num, input_num, indice_dim_0, indice_dim_1, indice_stride_dptr,
                          input_shape_dptr, device_id_, stream);
  cudaFree(indice_stride_dptr);
  cudaFree(input_shape_dptr);
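One thing the visible context does not show is `flag` being released: `indice_stride_dptr` and `input_shape_dptr` are freed, but the flag buffer is not. Assuming it really is a per-call temporary (the rest of the plugin is not shown here), a symmetric cleanup at the end of RunCudaTensorScatterAdd would look like this sketch:

  // Hypothetical cleanup, mirroring the other device temporaries
  // (the real plugin may free or cache the flag elsewhere).
  cudaFree(indice_stride_dptr);
  cudaFree(input_shape_dptr);
  cudaFree(flag);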

View File

@@ -3333,6 +3333,7 @@ def tensor_scatter_add(input_x, indices, updates):
    Raises:
        TypeError: If dtype of `indices` is neither int32 nor int64.
        ValueError: If length of shape of `input_x` is less than the last dimension of shape of `indices`.
        RuntimeError: If a value of `indices` is out of the index range of `input_x`.
    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``
@@ -3385,6 +3386,7 @@ def tensor_scatter_sub(input_x, indices, updates):
    Raises:
        TypeError: If dtype of `indices` is neither int32 nor int64.
        ValueError: If length of shape of `input_x` is less than the last dimension of shape of `indices`.
        RuntimeError: If a value of `indices` is out of the index range of `input_x`.
    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``
@@ -3432,6 +3434,7 @@ def tensor_scatter_max(input_x, indices, updates):
    Raises:
        TypeError: If dtype of `indices` is neither int32 nor int64.
        ValueError: If length of shape of `input_x` is less than the last dimension of shape of `indices`.
        RuntimeError: If a value of `indices` is out of the index range of `input_x`.
    Supported Platforms:
        ``GPU`` ``CPU``
@@ -3483,6 +3486,7 @@ def tensor_scatter_min(input_x, indices, updates):
    Raises:
        TypeError: If dtype of `indices` is neither int32 nor int64.
        ValueError: If length of shape of `input_x` is less than the last dimension of shape of `indices`.
        RuntimeError: If a value of `indices` is out of the index range of `input_x`.
    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``
@@ -4590,6 +4594,7 @@ def tensor_scatter_mul(input_x, indices, updates):
    Raises:
        TypeError: If dtype of `indices` is neither int32 nor int64.
        ValueError: If length of shape of `input_x` is less than the last dimension of shape of `indices`.
        RuntimeError: If a value of `indices` is out of the index range of `input_x`.
    Supported Platforms:
        ``GPU`` ``CPU``
@@ -4645,6 +4650,7 @@ def tensor_scatter_div(input_x, indices, updates):
    Raises:
        TypeError: If dtype of `indices` is neither int32 nor int64.
        ValueError: If length of shape of `input_x` is less than the last dimension of shape of `indices`.
        RuntimeError: If a value of `indices` is out of the index range of `input_x`.
    Supported Platforms:
        ``GPU`` ``CPU``

View File

@@ -6253,6 +6253,7 @@ class TensorScatterUpdate(_TensorScatterOp):
        TypeError: If dtype of `indices` is neither int32 nor int64.
        ValueError: If length of shape of `input_x` is less than the last dimension of shape of `indices`.
        ValueError: If the value of `input_x` does not match the input `indices`.
        RuntimeError: If a value of `indices` is out of the index range of `input_x`.
    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``