[feat][assistant][I5EWLH] add new gpu operator LuUnpackGrad
This commit is contained in:
parent
5d292df0bb
commit
1caf3bbbfc
|
@ -0,0 +1,284 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "lu_unpack_grad_impl.cuh"
|
||||
|
||||
template <typename T>
|
||||
__global__ void TrilExpendWidth(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
|
||||
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width) {
|
||||
for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
|
||||
int64_t matrix_size = lu_data_height * lu_data_width;
|
||||
int64_t idx_height = pos % matrix_size / lu_data_width;
|
||||
int64_t idx_width = pos % matrix_size % lu_data_width;
|
||||
if (idx_width >= idx_height) {
|
||||
l_grad_output[pos] = 0;
|
||||
} else {
|
||||
int64_t in_pos = ((pos / matrix_size) * matrix_L_height + idx_height) * matrix_L_width + idx_width;
|
||||
l_grad_output[pos] = l_grad_input[in_pos];
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void TrilLower(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
|
||||
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width) {
|
||||
for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
|
||||
int64_t matrix_size = lu_data_height * lu_data_width;
|
||||
int64_t idx_height = pos % matrix_size / lu_data_width;
|
||||
int64_t idx_width = pos % matrix_size % lu_data_width;
|
||||
if (idx_width >= idx_height) {
|
||||
l_grad_output[pos] = 0;
|
||||
} else {
|
||||
l_grad_output[pos] = l_grad_input[pos];
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void TriuExpendHeight(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
|
||||
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width) {
|
||||
for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
|
||||
int64_t matrix_size = lu_data_height * lu_data_width;
|
||||
int64_t idx_height = pos % matrix_size / lu_data_width;
|
||||
int64_t idx_width = pos % matrix_size % lu_data_width;
|
||||
if (idx_width >= idx_height) {
|
||||
int64_t in_pos = ((pos / matrix_size) * matrix_U_height + idx_height) * matrix_U_width + idx_width;
|
||||
u_grad_output[pos] = u_grad_input[in_pos];
|
||||
} else {
|
||||
u_grad_output[pos] = 0;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void TriuUpper(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
|
||||
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width) {
|
||||
for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
|
||||
int64_t matrix_size = lu_data_height * lu_data_width;
|
||||
int64_t idx_height = pos % matrix_size / lu_data_width;
|
||||
int64_t idx_width = pos % matrix_size % lu_data_width;
|
||||
if (idx_height <= idx_width) {
|
||||
u_grad_output[pos] = u_grad_input[pos];
|
||||
} else {
|
||||
u_grad_output[pos] = 0;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void CalTrilExpendWidth(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
|
||||
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream) {
|
||||
TrilExpendWidth<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
|
||||
size, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output, lu_data_height, lu_data_width);
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void CalTrilLower(const int64_t size, T *l_grad_input, const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
T *l_grad_output, const int64_t lu_data_height, const int64_t lu_data_width,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream) {
|
||||
TrilLower<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
|
||||
size, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output, lu_data_height, lu_data_width);
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void CalTriuExpendHeight(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
|
||||
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream) {
|
||||
TriuExpendHeight<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
|
||||
size, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output, lu_data_height, lu_data_width);
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void CalTriuUpper(const int64_t size, T *u_grad_input, const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
T *u_grad_output, const int64_t lu_data_height, const int64_t lu_data_width,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream) {
|
||||
TriuUpper<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
|
||||
size, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output, lu_data_height, lu_data_width);
|
||||
return;
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void CalTrilExpendWidth<double>(const int64_t size, double *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
double *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilExpendWidth<float>(const int64_t size, float *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
float *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilExpendWidth<half>(const int64_t size, half *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
half *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilExpendWidth<int64_t>(const int64_t size, int64_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
int64_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilExpendWidth<int32_t>(const int64_t size, int32_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
int32_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilExpendWidth<int16_t>(const int64_t size, int16_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
int16_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilExpendWidth<int8_t>(const int64_t size, int8_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
int8_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilExpendWidth<uint8_t>(const int64_t size, uint8_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
uint8_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void CalTrilLower<double>(const int64_t size, double *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
double *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilLower<float>(const int64_t size, float *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
float *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilLower<half>(const int64_t size, half *l_grad_input, const int64_t matrix_L_height,
|
||||
const int64_t matrix_L_width, half *l_grad_output,
|
||||
const int64_t lu_data_height, const int64_t lu_data_width,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilLower<int64_t>(const int64_t size, int64_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
int64_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilLower<int32_t>(const int64_t size, int32_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
int32_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilLower<int16_t>(const int64_t size, int16_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
int16_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilLower<int8_t>(const int64_t size, int8_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
int8_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilLower<uint8_t>(const int64_t size, uint8_t *l_grad_input,
|
||||
const int64_t matrix_L_height, const int64_t matrix_L_width,
|
||||
uint8_t *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void CalTriuExpendHeight<double>(const int64_t size, double *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
double *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuExpendHeight<float>(const int64_t size, float *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
float *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuExpendHeight<half>(const int64_t size, half *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
half *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuExpendHeight<int64_t>(const int64_t size, int64_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
int64_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuExpendHeight<int32_t>(const int64_t size, int32_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
int32_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuExpendHeight<int16_t>(const int64_t size, int16_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
int16_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuExpendHeight<int8_t>(const int64_t size, int8_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
int8_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuExpendHeight<uint8_t>(const int64_t size, uint8_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
uint8_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void CalTriuUpper<double>(const int64_t size, double *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
double *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuUpper<float>(const int64_t size, float *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
float *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuUpper<half>(const int64_t size, half *u_grad_input, const int64_t matrix_U_height,
|
||||
const int64_t matrix_U_width, half *u_grad_output,
|
||||
const int64_t lu_data_height, const int64_t lu_data_width,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuUpper<int64_t>(const int64_t size, int64_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
int64_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuUpper<int32_t>(const int64_t size, int32_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
int32_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuUpper<int16_t>(const int64_t size, int16_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
int16_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuUpper<int8_t>(const int64_t size, int8_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
int8_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuUpper<uint8_t>(const int64_t size, uint8_t *u_grad_input,
|
||||
const int64_t matrix_U_height, const int64_t matrix_U_width,
|
||||
uint8_t *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
|
@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LU_UNPACK_GRAD_IMPL_CUH_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LU_UNPACK_GRAD_IMPL_CUH_
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
|
||||
|
||||
template <typename T>
|
||||
CUDA_LIB_EXPORT void CalTrilExpendWidth(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
|
||||
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
template <typename T>
|
||||
CUDA_LIB_EXPORT void CalTrilLower(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
|
||||
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
|
||||
template <typename T>
|
||||
CUDA_LIB_EXPORT void CalTriuExpendHeight(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
|
||||
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
template <typename T>
|
||||
CUDA_LIB_EXPORT void CalTriuUpper(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
|
||||
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
|
||||
const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LU_UNPACK_GRAD_IMPL_CUH_
|
|
@ -0,0 +1,369 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cuh"
|
||||
|
||||
template <typename S>
|
||||
__global__ void InitOrder(const size_t p_size, const int64_t lu_data_dim1, S *final_order) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < p_size * lu_data_dim1; pos += gridDim.x * blockDim.x) {
|
||||
final_order[pos] = pos % lu_data_dim1;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename S>
|
||||
__global__ void SwapOrder(const size_t p_size, const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
|
||||
S *lu_pivots_ptr, S *final_order) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < p_size; pos += gridDim.x * blockDim.x) {
|
||||
S *lu_pivots_working_ptr = lu_pivots_ptr + pos * lu_pivots_dim;
|
||||
for (S idx = 0; idx < lu_pivots_dim; ++idx) {
|
||||
S perm_pivots_id = lu_pivots_working_ptr[idx] - 1;
|
||||
S tmp = final_order[pos * lu_data_dim1 + idx];
|
||||
final_order[pos * lu_data_dim1 + idx] = final_order[pos * lu_data_dim1 + perm_pivots_id];
|
||||
final_order[pos * lu_data_dim1 + perm_pivots_id] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename S>
|
||||
__global__ void AssignEyeValue(const size_t p_size, const size_t lu_data_dim1, S *final_order, T *pivots_ptr) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < p_size * lu_data_dim1; pos += gridDim.x * blockDim.x) {
|
||||
size_t row_start = (pos / lu_data_dim1 * lu_data_dim1 + final_order[pos]) * lu_data_dim1;
|
||||
size_t eye_idx = pos % lu_data_dim1;
|
||||
*(pivots_ptr + row_start + eye_idx) = static_cast<T>(1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T, typename S>
|
||||
void TransposeEyeMatrix(S *lu_pivots_ptr, T *pivots_ptr, S *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1, const int64_t lu_pivots_dim, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream) {
|
||||
cudaMemset(pivots_ptr, 0, batch_num * lu_data_dim1 * lu_data_dim1 * sizeof(T));
|
||||
InitOrder<<<CUDA_BLOCKS(device_id, batch_num * lu_data_dim1), CUDA_THREADS(device_id), 0, cuda_stream>>>(
|
||||
batch_num, lu_data_dim1, final_order);
|
||||
SwapOrder<<<CUDA_BLOCKS(device_id, batch_num), CUDA_THREADS(device_id), 0, cuda_stream>>>(
|
||||
batch_num, lu_data_dim1, lu_pivots_dim, lu_pivots_ptr, final_order);
|
||||
AssignEyeValue<<<CUDA_BLOCKS(device_id, batch_num * lu_data_dim1), CUDA_THREADS(device_id), 0, cuda_stream>>>(
|
||||
batch_num, lu_data_dim1, final_order, pivots_ptr);
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void TriuAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
|
||||
const int64_t lu_row, const int64_t lu_col, T *output) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
|
||||
int64_t out_matrix_size = out_row * out_col;
|
||||
int64_t in_matrix_size = lu_row * lu_col;
|
||||
int64_t idx_row = pos % out_matrix_size / out_col;
|
||||
int64_t idx_col = pos % out_matrix_size % out_col;
|
||||
|
||||
if (idx_row <= idx_col) {
|
||||
int64_t batch_size = pos / out_matrix_size;
|
||||
int64_t in_pos = batch_size * in_matrix_size + idx_row * lu_col + idx_col;
|
||||
output[pos] = input[in_pos];
|
||||
} else {
|
||||
output[pos] = static_cast<T>(0.0);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void CalTriuAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col, const int64_t lu_row,
|
||||
const int64_t lu_col, T *output, const uint32_t &device_id, cudaStream_t cuda_stream) {
|
||||
TriuAux<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(size, input, out_row, out_col,
|
||||
lu_row, lu_col, output);
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void TrilAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
|
||||
const int64_t lu_row, const int64_t lu_col, T *output) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
|
||||
int64_t out_matrix_size = out_row * out_col;
|
||||
int64_t in_matrix_size = lu_row * lu_col;
|
||||
int64_t idx_row = pos % out_matrix_size / out_col;
|
||||
int64_t idx_col = pos % out_matrix_size % out_col;
|
||||
|
||||
if (idx_row == idx_col) {
|
||||
output[pos] = static_cast<T>(1.0);
|
||||
} else if (idx_row > idx_col) {
|
||||
int64_t batch_size = pos / out_matrix_size;
|
||||
int64_t in_pos = batch_size * in_matrix_size + idx_row * lu_col + idx_col;
|
||||
output[pos] = input[in_pos];
|
||||
} else {
|
||||
output[pos] = static_cast<T>(0.0);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void CalTrilAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col, const int64_t lu_row,
|
||||
const int64_t lu_col, T *output, const uint32_t &device_id, cudaStream_t cuda_stream) {
|
||||
TrilAux<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(size, input, out_row, out_col,
|
||||
lu_row, lu_col, output);
|
||||
return;
|
||||
}
|
||||
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int64_t>(int64_t *lu_pivots_ptr, double *pivots_ptr,
|
||||
int64_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int32_t>(int32_t *lu_pivots_ptr, double *pivots_ptr,
|
||||
int32_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int16_t>(int16_t *lu_pivots_ptr, double *pivots_ptr,
|
||||
int16_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int8_t>(int8_t *lu_pivots_ptr, double *pivots_ptr,
|
||||
int8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, uint8_t>(uint8_t *lu_pivots_ptr, double *pivots_ptr,
|
||||
uint8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int64_t>(int64_t *lu_pivots_ptr, float *pivots_ptr,
|
||||
int64_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int32_t>(int32_t *lu_pivots_ptr, float *pivots_ptr,
|
||||
int32_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int16_t>(int16_t *lu_pivots_ptr, float *pivots_ptr,
|
||||
int16_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int8_t>(int8_t *lu_pivots_ptr, float *pivots_ptr,
|
||||
int8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, uint8_t>(uint8_t *lu_pivots_ptr, float *pivots_ptr,
|
||||
uint8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int64_t>(int64_t *lu_pivots_ptr, half *pivots_ptr,
|
||||
int64_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int32_t>(int32_t *lu_pivots_ptr, half *pivots_ptr,
|
||||
int32_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int16_t>(int16_t *lu_pivots_ptr, half *pivots_ptr,
|
||||
int16_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int8_t>(int8_t *lu_pivots_ptr, half *pivots_ptr,
|
||||
int8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, uint8_t>(uint8_t *lu_pivots_ptr, half *pivots_ptr,
|
||||
uint8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int64_t>(int64_t *lu_pivots_ptr, int64_t *pivots_ptr,
|
||||
int64_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int32_t>(int32_t *lu_pivots_ptr, int64_t *pivots_ptr,
|
||||
int32_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int16_t>(int16_t *lu_pivots_ptr, int64_t *pivots_ptr,
|
||||
int16_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int8_t>(int8_t *lu_pivots_ptr, int64_t *pivots_ptr,
|
||||
int8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, uint8_t>(uint8_t *lu_pivots_ptr, int64_t *pivots_ptr,
|
||||
uint8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int64_t>(int64_t *lu_pivots_ptr, int32_t *pivots_ptr,
|
||||
int64_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int32_t>(int32_t *lu_pivots_ptr, int32_t *pivots_ptr,
|
||||
int32_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int16_t>(int16_t *lu_pivots_ptr, int32_t *pivots_ptr,
|
||||
int16_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int8_t>(int8_t *lu_pivots_ptr, int32_t *pivots_ptr,
|
||||
int8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, uint8_t>(uint8_t *lu_pivots_ptr, int32_t *pivots_ptr,
|
||||
uint8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int64_t>(int64_t *lu_pivots_ptr, int16_t *pivots_ptr,
|
||||
int64_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int32_t>(int32_t *lu_pivots_ptr, int16_t *pivots_ptr,
|
||||
int32_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int16_t>(int16_t *lu_pivots_ptr, int16_t *pivots_ptr,
|
||||
int16_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int8_t>(int8_t *lu_pivots_ptr, int16_t *pivots_ptr,
|
||||
int8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, uint8_t>(uint8_t *lu_pivots_ptr, int16_t *pivots_ptr,
|
||||
uint8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int64_t>(int64_t *lu_pivots_ptr, int8_t *pivots_ptr,
|
||||
int64_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int32_t>(int32_t *lu_pivots_ptr, int8_t *pivots_ptr,
|
||||
int32_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int16_t>(int16_t *lu_pivots_ptr, int8_t *pivots_ptr,
|
||||
int16_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int8_t>(int8_t *lu_pivots_ptr, int8_t *pivots_ptr,
|
||||
int8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, uint8_t>(uint8_t *lu_pivots_ptr, int8_t *pivots_ptr,
|
||||
uint8_t *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1,
|
||||
const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<uint8_t>(const size_t size, const uint8_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
uint8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<uint16_t>(const size_t size, const uint16_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
uint16_t *output, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<uint32_t>(const size_t size, const uint32_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
uint32_t *output, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<uint64_t>(const size_t size, const uint64_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
uint64_t *output, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<int8_t>(const size_t size, const int8_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
int8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<int16_t>(const size_t size, const int16_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
int16_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<int>(const size_t size, const int *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
int *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<int64_t>(const size_t size, const int64_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
int64_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<half>(const size_t size, const half *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
half *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<float>(const size_t size, const float *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
float *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<double>(const size_t size, const double *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
double *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTriuAux<bool>(const size_t size, const bool *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
bool *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<uint8_t>(const size_t size, const uint8_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
uint8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<uint16_t>(const size_t size, const uint16_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
uint16_t *output, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<uint32_t>(const size_t size, const uint32_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
uint32_t *output, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<uint64_t>(const size_t size, const uint64_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
uint64_t *output, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<int8_t>(const size_t size, const int8_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
int8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<int16_t>(const size_t size, const int16_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
int16_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<int>(const size_t size, const int *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
int *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<int64_t>(const size_t size, const int64_t *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
int64_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<half>(const size_t size, const half *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
half *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<float>(const size_t size, const float *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
float *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<double>(const size_t size, const double *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
double *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
template CUDA_LIB_EXPORT void CalTrilAux<bool>(const size_t size, const bool *input, const int64_t out_row,
|
||||
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
|
||||
bool *output, const uint32_t &device_id, cudaStream_t cuda_stream);
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LUUNPACK_IMPL_CUH_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LUUNPACK_IMPL_CUH_
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
|
||||
|
||||
template <typename T, typename S>
|
||||
CUDA_LIB_EXPORT void TransposeEyeMatrix(S *lu_pivots_ptr, T *pivots_ptr, S *final_order, const int64_t batch_num,
|
||||
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
|
||||
const uint32_t &device_id, cudaStream_t cuda_stream);
|
||||
|
||||
template <typename T>
|
||||
CUDA_LIB_EXPORT void CalTrilAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
|
||||
const int64_t lu_row, const int64_t lu_col, T *output, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
template <typename T>
|
||||
CUDA_LIB_EXPORT void CalTriuAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
|
||||
const int64_t lu_row, const int64_t lu_col, T *output, const uint32_t &device_id,
|
||||
cudaStream_t cuda_stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LUUNPACK_IMPL_CUH_
|
|
@ -0,0 +1,455 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/math/lu_unpack_gpu_kernel.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr size_t kFirstDim = 1;
|
||||
constexpr size_t kSecondDim = 2;
|
||||
} // namespace
|
||||
|
||||
void LuUnpackGpuKernelMod::ResetResource() noexcept {
|
||||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
workspace_size_list_.clear();
|
||||
}
|
||||
|
||||
template <typename T, typename S>
|
||||
bool LuUnpackGpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
T *lu_data_ptr = GetDeviceAddress<T>(inputs, kIndex0);
|
||||
MS_EXCEPTION_IF_NULL(lu_data_ptr);
|
||||
S *lu_pivots_ptr = GetDeviceAddress<S>(inputs, kIndex1);
|
||||
MS_EXCEPTION_IF_NULL(lu_pivots_ptr);
|
||||
|
||||
T *pivots_ptr = GetDeviceAddress<T>(outputs, kIndex0);
|
||||
T *l_ptr = GetDeviceAddress<T>(outputs, kIndex1);
|
||||
T *u_ptr = GetDeviceAddress<T>(outputs, kIndex2);
|
||||
S *final_order = GetDeviceAddress<S>(workspace, kIndex0);
|
||||
|
||||
// triu and tril
|
||||
if (lu_data_dim1_ > lu_data_dim2_) {
|
||||
CalTriuAux(batch_num_ * u_stride_, lu_data_ptr, lu_data_dim2_, lu_data_dim2_, lu_data_dim1_, lu_data_dim2_, u_ptr,
|
||||
device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
|
||||
CalTrilAux(batch_num_ * l_stride_, lu_data_ptr, lu_data_dim1_, lu_data_dim2_, lu_data_dim1_, lu_data_dim2_, l_ptr,
|
||||
device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
|
||||
} else {
|
||||
CalTriuAux(batch_num_ * u_stride_, lu_data_ptr, lu_data_dim1_, lu_data_dim2_, lu_data_dim1_, lu_data_dim2_, u_ptr,
|
||||
device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
|
||||
CalTrilAux(batch_num_ * l_stride_, lu_data_ptr, lu_data_dim1_, lu_data_dim1_, lu_data_dim1_, lu_data_dim2_, l_ptr,
|
||||
device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
|
||||
}
|
||||
|
||||
// swap and index_select
|
||||
TransposeEyeMatrix(lu_pivots_ptr, pivots_ptr, final_order, batch_num_, lu_data_dim1_, lu_pivots_dim_, device_id_,
|
||||
reinterpret_cast<cudaStream_t>(cuda_stream_));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool LuUnpackGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs) {
|
||||
kernel_name_ = base_operator->name();
|
||||
|
||||
if (inputs.empty() || outputs.empty()) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "' got empty inputs or outputs, which is invalid.";
|
||||
return false;
|
||||
}
|
||||
auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
|
||||
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
|
||||
if (!is_match) {
|
||||
MS_LOG(EXCEPTION) << "LuUnpack does not support this kernel data type: " << kernel_attr << ".";
|
||||
return false;
|
||||
}
|
||||
|
||||
kernel_func_ = func_list_[index].second;
|
||||
unit_size1_ = abstract::TypeIdSize(inputs.at(kIndex0)->GetDtype());
|
||||
unit_size2_ = abstract::TypeIdSize(inputs.at(kIndex1)->GetDtype());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int LuUnpackGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs,
|
||||
const std::map<uint32_t, tensor::TensorPtr> &) {
|
||||
int ret = KernelMod::Resize(base_operator, inputs, outputs);
|
||||
if (ret != KRET_OK) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::vector<int64_t> lu_data_shape = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
|
||||
inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
|
||||
std::vector<int64_t> lu_pivots_shape = std::vector<int64_t>(inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(),
|
||||
inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
|
||||
lu_data_size_ = size_t(std::accumulate(lu_data_shape.begin(), lu_data_shape.end(), 1, std::multiplies<int64_t>()));
|
||||
lu_pivots_size_ =
|
||||
size_t(std::accumulate(lu_pivots_shape.begin(), lu_pivots_shape.end(), 1, std::multiplies<int64_t>()));
|
||||
|
||||
auto lu_data_shape_vec = inputs[kIndex0]->GetShapeVector();
|
||||
auto lu_pivots_shape_vec = inputs[kIndex1]->GetShapeVector();
|
||||
size_t lu_data_dims = lu_data_shape_vec.size();
|
||||
size_t lu_pivots_dims = lu_pivots_shape_vec.size();
|
||||
std::vector<int64_t> lu_data_dims_vector = lu_data_shape_vec;
|
||||
std::vector<int64_t> lu_pivots_dims_vector = lu_pivots_shape_vec;
|
||||
|
||||
lu_data_dim1_ = lu_data_dims_vector[lu_data_dims - kSecondDim];
|
||||
lu_data_dim2_ = lu_data_dims_vector[lu_data_dims - kFirstDim];
|
||||
if (lu_data_dim1_ > lu_data_dim2_) {
|
||||
l_stride_ = lu_data_dim1_ * lu_data_dim2_;
|
||||
u_stride_ = lu_data_dim2_ * lu_data_dim2_;
|
||||
} else {
|
||||
l_stride_ = lu_data_dim1_ * lu_data_dim1_;
|
||||
u_stride_ = lu_data_dim1_ * lu_data_dim2_;
|
||||
}
|
||||
lu_pivots_dim_ = lu_pivots_dims_vector[lu_pivots_dims - kFirstDim];
|
||||
batch_num_ = lu_data_size_ / (lu_data_dim1_ * lu_data_dim2_);
|
||||
workspace_size_list_.push_back(batch_num_ * lu_data_dim1_ * unit_size2_);
|
||||
return KRET_OK;
|
||||
}
|
||||
|
||||
bool LuUnpackGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
cuda_stream_ = stream_ptr;
|
||||
return kernel_func_(this, inputs, workspace, outputs);
|
||||
}
|
||||
|
||||
std::vector<KernelAttr> LuUnpackGpuKernelMod::GetOpSupport() {
|
||||
std::vector<KernelAttr> support_list;
|
||||
(void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
|
||||
[](const std::pair<KernelAttr, LuUnpackFunc> &pair) { return pair.first; });
|
||||
return support_list;
|
||||
}
|
||||
|
||||
std::vector<std::pair<KernelAttr, LuUnpackGpuKernelMod::LuUnpackFunc>> LuUnpackGpuKernelMod::func_list_ = {
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat64)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<double, int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat64)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<double, int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat64)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<double, int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat64)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<double, int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat64)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<double, uint8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<float, int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<float, int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<float, int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<float, int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<float, uint8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<half, int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<half, int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<half, int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<half, int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<half, uint8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, uint8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, uint8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, uint8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, uint8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, uint8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8),
|
||||
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, uint8_t>}};
|
||||
|
||||
MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, LuUnpack, LuUnpackGpuKernelMod);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,78 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LUUNPACK_GPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LUUNPACK_GPU_KERNEL_H_
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include "mindspore/core/ops/lu_unpack.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/factory/ms_factory.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cuh"
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_address.h"
|
||||
#include "utils/check_convert_utils.h"
|
||||
#include "abstract/ops/primitive_infer_map.h"
|
||||
|
||||
namespace mindspore {
|
||||
constexpr size_t kInputNum = 2;
|
||||
constexpr size_t kOutputNum = 3;
|
||||
namespace kernel {
|
||||
class LuUnpackGpuKernelMod : public NativeGpuKernelMod {
|
||||
public:
|
||||
LuUnpackGpuKernelMod() { ResetResource(); }
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
|
||||
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs) override;
|
||||
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;
|
||||
|
||||
protected:
|
||||
std::vector<KernelAttr> GetOpSupport() override;
|
||||
|
||||
private:
|
||||
using LuUnpackFunc =
|
||||
std::function<bool(LuUnpackGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
|
||||
LuUnpackFunc kernel_func_;
|
||||
void *cuda_stream_{nullptr};
|
||||
static std::vector<std::pair<KernelAttr, LuUnpackFunc>> func_list_;
|
||||
|
||||
size_t lu_data_size_;
|
||||
size_t lu_pivots_size_;
|
||||
int64_t lu_data_dim1_;
|
||||
int64_t lu_data_dim2_;
|
||||
int64_t l_stride_;
|
||||
int64_t u_stride_;
|
||||
int64_t lu_pivots_dim_;
|
||||
int64_t batch_num_;
|
||||
size_t unit_size1_;
|
||||
size_t unit_size2_;
|
||||
|
||||
void ResetResource() noexcept;
|
||||
template <typename T_data, typename T_pivots>
|
||||
bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs);
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LUUNPACK_GPU_KERNEL_H_
|
|
@ -0,0 +1,213 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "plugin/device/gpu/kernel/math/lu_unpack_grad_gpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr const size_t kLuUnpackGradInputNum = 3;
|
||||
constexpr const size_t kLuUnpackGradOutputNum = 2;
|
||||
constexpr const int64_t Min_Dim = 2;
|
||||
} // namespace
|
||||
|
||||
bool LuUnpackGradGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs) {
|
||||
auto kernel_ptr = std::dynamic_pointer_cast<ops::LuUnpackGrad>(base_operator);
|
||||
MS_ERROR_IF_NULL_W_RET_VAL(kernel_ptr, false);
|
||||
|
||||
kernel_name_ = kernel_ptr->name();
|
||||
if (inputs.size() != kLuUnpackGradInputNum || outputs.size() != kLuUnpackGradOutputNum) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', input and output size must be " << kLuUnpackGradInputNum << " and "
|
||||
<< kLuUnpackGradOutputNum << ", but got " << inputs.size() << " and " << outputs.size();
|
||||
return false;
|
||||
}
|
||||
|
||||
auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
|
||||
auto pair = MatchKernelAttr(kernel_attr, GetOpSupport());
|
||||
if (!pair.first) {
|
||||
MS_LOG(ERROR) << "'" << kernel_name_ << "' does not support this kernel data type: " << kernel_attr;
|
||||
return false;
|
||||
}
|
||||
|
||||
kernel_func_ = func_list_[pair.second].second;
|
||||
unit_size_ = abstract::TypeIdSize(inputs.at(kIndex0)->GetDtype());
|
||||
input_L_shape = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
|
||||
inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
|
||||
input_U_shape = std::vector<int64_t>(inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(),
|
||||
inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
|
||||
input_LU_shape = std::vector<int64_t>(inputs.at(kIndex2)->GetDeviceShapeAdaptively().begin(),
|
||||
inputs.at(kIndex2)->GetDeviceShapeAdaptively().end());
|
||||
return true;
|
||||
}
|
||||
|
||||
int LuUnpackGradGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs,
|
||||
const std::map<uint32_t, tensor::TensorPtr> &others) {
|
||||
int ret = KernelMod::Resize(base_operator, inputs, outputs);
|
||||
if (ret != KRET_OK) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ResetResource();
|
||||
input_L_shape = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
|
||||
inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
|
||||
input_U_shape = std::vector<int64_t>(inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(),
|
||||
inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
|
||||
input_LU_shape = std::vector<int64_t>(inputs.at(kIndex2)->GetDeviceShapeAdaptively().begin(),
|
||||
inputs.at(kIndex2)->GetDeviceShapeAdaptively().end());
|
||||
int64_t input_L_elements_ =
|
||||
std::accumulate(input_L_shape.begin(), input_L_shape.end(), int64_t(1), std::multiplies<int64_t>());
|
||||
int64_t input_U_elements_ =
|
||||
std::accumulate(input_U_shape.begin(), input_U_shape.end(), int64_t(1), std::multiplies<int64_t>());
|
||||
int64_t input_LU_elements_ =
|
||||
std::accumulate(input_LU_shape.begin(), input_LU_shape.end(), int64_t(1), std::multiplies<int64_t>());
|
||||
input_elements_ = input_L_elements_ + input_U_elements_ + input_LU_elements_;
|
||||
int64_t input_L_size = input_L_elements_ * unit_size_;
|
||||
int64_t input_U_size = input_U_elements_ * unit_size_;
|
||||
int64_t input_LU_size = input_LU_elements_ * unit_size_;
|
||||
input_size_list_.emplace_back(input_L_size);
|
||||
input_size_list_.emplace_back(input_U_size);
|
||||
input_size_list_.emplace_back(input_LU_size);
|
||||
output_size_list_.emplace_back(input_LU_size);
|
||||
output_size_list_.emplace_back(input_LU_size);
|
||||
workspace_size_list_.emplace_back(input_L_size);
|
||||
workspace_size_list_.emplace_back(input_U_size);
|
||||
|
||||
return KRET_OK;
|
||||
}
|
||||
|
||||
void LuUnpackGradGpuKernelMod::ResetResource() noexcept {
|
||||
input_elements_ = 0;
|
||||
is_null_input_ = false;
|
||||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
workspace_size_list_.clear();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool LuUnpackGradGpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
T *l_grad_input = GetDeviceAddress<T>(inputs, kIndex0);
|
||||
T *u_grad_input = GetDeviceAddress<T>(inputs, kIndex1);
|
||||
T *l_grad_output = GetDeviceAddress<T>(outputs, kIndex0);
|
||||
T *u_grad_output = GetDeviceAddress<T>(outputs, kIndex1);
|
||||
|
||||
int64_t lu_data_dims = input_LU_shape.size();
|
||||
int64_t lu_data_elem_num =
|
||||
std::accumulate(input_LU_shape.begin(), input_LU_shape.end(), int64_t(1), std::multiplies<int64_t>());
|
||||
int64_t lu_data_height = input_LU_shape[lu_data_dims - 2];
|
||||
int64_t lu_data_width = input_LU_shape[lu_data_dims - 1];
|
||||
int64_t lu_data_stride = lu_data_height * lu_data_width;
|
||||
int64_t matrix_num = lu_data_elem_num / lu_data_stride;
|
||||
int64_t output_stride = lu_data_stride;
|
||||
|
||||
int64_t input_L_dims = input_L_shape.size();
|
||||
int64_t input_U_dims = input_U_shape.size();
|
||||
int64_t matrix_L_width = input_L_shape[input_L_dims - 2];
|
||||
int64_t matrix_L_height = input_L_shape[input_L_dims - 1];
|
||||
int64_t matrix_U_width = input_U_shape[input_U_dims - 2];
|
||||
int64_t matrix_U_height = input_U_shape[input_U_dims - 1];
|
||||
|
||||
if (lu_data_width > lu_data_height) {
|
||||
CalTrilExpendWidth(matrix_num * output_stride, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output,
|
||||
lu_data_height, lu_data_width, device_id_, stream_ptr_);
|
||||
CalTriuUpper(matrix_num * output_stride, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output,
|
||||
lu_data_height, lu_data_width, device_id_, stream_ptr_);
|
||||
} else {
|
||||
if (lu_data_height > lu_data_width) {
|
||||
CalTrilLower(matrix_num * output_stride, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output,
|
||||
lu_data_height, lu_data_width, device_id_, stream_ptr_);
|
||||
CalTriuExpendHeight(matrix_num * output_stride, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output,
|
||||
lu_data_height, lu_data_width, device_id_, stream_ptr_);
|
||||
} else {
|
||||
CalTrilLower(matrix_num * output_stride, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output,
|
||||
lu_data_height, lu_data_width, device_id_, stream_ptr_);
|
||||
CalTriuUpper(matrix_num * output_stride, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output,
|
||||
lu_data_height, lu_data_width, device_id_, stream_ptr_);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<std::pair<KernelAttr, LuUnpackGradGpuKernelMod::LuUnpackGradFunc>> LuUnpackGradGpuKernelMod::func_list_ = {
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat64)
|
||||
.AddInputAttr(kNumberTypeFloat64)
|
||||
.AddInputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64)
|
||||
.AddOutputAttr(kNumberTypeFloat64),
|
||||
&LuUnpackGradGpuKernelMod::LaunchKernel<double>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
&LuUnpackGradGpuKernelMod::LaunchKernel<float>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
&LuUnpackGradGpuKernelMod::LaunchKernel<half>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64),
|
||||
&LuUnpackGradGpuKernelMod::LaunchKernel<int64_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
&LuUnpackGradGpuKernelMod::LaunchKernel<int32_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddInputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16)
|
||||
.AddOutputAttr(kNumberTypeInt16),
|
||||
&LuUnpackGradGpuKernelMod::LaunchKernel<int16_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddInputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8)
|
||||
.AddOutputAttr(kNumberTypeInt8),
|
||||
&LuUnpackGradGpuKernelMod::LaunchKernel<int8_t>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddInputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8)
|
||||
.AddOutputAttr(kNumberTypeUInt8),
|
||||
&LuUnpackGradGpuKernelMod::LaunchKernel<uint8_t>}};
|
||||
|
||||
std::vector<KernelAttr> LuUnpackGradGpuKernelMod::GetOpSupport() {
|
||||
std::vector<KernelAttr> support_list;
|
||||
(void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
|
||||
[](const std::pair<KernelAttr, LuUnpackGradFunc> &pair) { return pair.first; });
|
||||
return support_list;
|
||||
}
|
||||
|
||||
MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, LuUnpackGrad, LuUnpackGradGpuKernelMod);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,81 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_UNPACK_GRAD_GPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_UNPACK_GRAD_GPU_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <functional>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include "mindspore/core/ops/grad/lu_unpack_grad.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/factory/ms_factory.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_address.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_grad_impl.cuh"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class LuUnpackGradGpuKernelMod : public NativeGpuKernelMod {
|
||||
public:
|
||||
LuUnpackGradGpuKernelMod() { ResetResource(); }
|
||||
~LuUnpackGradGpuKernelMod() override = default;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
VARIABLE_NOT_USED(workspace);
|
||||
stream_ptr_ = reinterpret_cast<cudaStream_t>(stream_ptr);
|
||||
return kernel_func_(this, inputs, workspace, outputs);
|
||||
}
|
||||
|
||||
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs) override;
|
||||
|
||||
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs,
|
||||
const std::map<uint32_t, tensor::TensorPtr> &others = std::map<uint32_t, tensor::TensorPtr>()) override;
|
||||
|
||||
std::vector<KernelAttr> GetOpSupport() override;
|
||||
|
||||
void ResetResource() noexcept;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<kernel::AddressPtr> &outputs);
|
||||
|
||||
using LuUnpackGradFunc =
|
||||
std::function<bool(LuUnpackGradGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
|
||||
static std::vector<std::pair<KernelAttr, LuUnpackGradFunc>> func_list_;
|
||||
LuUnpackGradFunc kernel_func_;
|
||||
|
||||
bool is_null_input_{false};
|
||||
int64_t unit_size_{1};
|
||||
int64_t input_elements_;
|
||||
std::vector<int64_t> input_L_shape;
|
||||
std::vector<int64_t> input_U_shape;
|
||||
std::vector<int64_t> input_LU_shape;
|
||||
cudaStream_t stream_ptr_{nullptr};
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_UNPACK_GRAD_GPU_KERNEL_H_
|
|
@ -45,6 +45,7 @@ abstract::TupleShapePtr LuUnpackGradInferShape(const PrimitivePtr &primitive,
|
|||
auto LU_data_dim1 = LU_data_shape[LU_data_shape.size() - 2];
|
||||
auto LU_data_dim2 = LU_data_shape[LU_data_shape.size() - 1];
|
||||
int64_t LU_data_min = std::min(LU_data_dim1, LU_data_dim2);
|
||||
|
||||
if (LU_data_dim1 != L_data_dim1) {
|
||||
MS_EXCEPTION(ValueError) << "For '" << primitive->name()
|
||||
<< "', L_grad's data dim[-2] and LU_data's dim[-2] should be same.";
|
||||
|
|
|
@ -42,6 +42,7 @@ abstract::TupleShapePtr LuUnpackInferShape(const PrimitivePtr &primitive,
|
|||
auto LU_data_shape = LU_data_shape_map[kShape];
|
||||
auto LU_pivots_shape_map = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[1]->BuildShape());
|
||||
auto LU_pivots_shape = LU_pivots_shape_map[kShape];
|
||||
|
||||
if (IsDynamicRank(LU_data_shape)) {
|
||||
auto output_shpe = std::make_shared<abstract::Shape>(LU_data_shape);
|
||||
return std::make_shared<abstract::TupleShape>(
|
||||
|
|
|
@ -6388,7 +6388,7 @@ class LuUnpack(Primitive):
|
|||
RuntimeError: On the Ascend platform, if the value of `LU_pivots` are out of range[1, LU_data.shape[-2]).
|
||||
|
||||
Supported Platforms:
|
||||
``Ascend`` ``CPU``
|
||||
``Ascend`` ``GPU`` ``CPU``
|
||||
|
||||
Examples:
|
||||
>>> LU_data = Tensor(np.array([[[-0.3806, -0.4872, 0.5536],
|
||||
|
@ -6399,11 +6399,32 @@ class LuUnpack(Primitive):
|
|||
... [ 0.1015, -0.5363, 0.6165]]]), mstype.float32)
|
||||
>>> LU_pivots = Tensor(np.array([[1, 3, 3],
|
||||
... [2, 3, 3]]), mstype.int32)
|
||||
>>> lu_unpack = ops.LuUnpack()
|
||||
>>> pivots, L, U = lu_unpack(LU_data, LU_pivots, unpack_data, unpack_pivots)
|
||||
>>> lu_unpack = LuUnpack()
|
||||
>>> pivots, L, U = lu_unpack(LU_data, LU_pivots)
|
||||
>>> print(pivots)
|
||||
[[[1. 0. 0.]
|
||||
[0. 0. 1.]
|
||||
[0. 1. 0.]]
|
||||
<BLANKLINE>
|
||||
[[0. 0. 1.]
|
||||
[1. 0. 0.]
|
||||
[0. 1. 0.]]]
|
||||
>>> print(L)
|
||||
[[[ 1. 0. 0. ]
|
||||
[-0.1287 1. 0. ]
|
||||
[ 0.2583 0.5239 1. ]]
|
||||
<BLANKLINE>
|
||||
[[ 1. 0. 0. ]
|
||||
[-0.6401 1. 0. ]
|
||||
[ 0.1015 -0.5363 1. ]]]
|
||||
>>> print(U)
|
||||
[[[-0.3806 -0.4872 0.5536]
|
||||
[ 0. 0.6508 -0.2396]
|
||||
[ 0. 0. 0.6902]]
|
||||
<BLANKLINE>
|
||||
[[ 0.6706 -1.1782 0.4574]
|
||||
[ 0. -0.4779 0.6701]
|
||||
[ 0. 0. 0.6165]]]
|
||||
"""
|
||||
|
||||
@prim_attr_register
|
||||
|
@ -6491,10 +6512,6 @@ class Polygamma(Primitive):
|
|||
.. math::
|
||||
\psi^{(a)}(x) = \frac{d^{(a)}}{dx^{(a)}} \psi(x)
|
||||
|
||||
Args:
|
||||
:math:`a \geq 0`: the order of the polygamma function.
|
||||
input (Tensor): the tensor to compute the polygamma function.
|
||||
|
||||
Inputs:
|
||||
- **a** (Tensor) - The order of the polygamma function, types: int32, int64.
|
||||
- **x** (Tensor) - The input tensor, types: float16, float32, float64.
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
# Copyright 2022 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import mindspore.nn as nn
|
||||
from mindspore import Tensor
|
||||
from mindspore.ops.operations import _grad_ops as G
|
||||
import mindspore.common.dtype as mstype
|
||||
|
||||
|
||||
class NetLuUnpackGrad(nn.Cell):
|
||||
def __init__(self, l_grad_flag=True, u_grad_flag=True):
|
||||
super().__init__()
|
||||
self.lu_unpack_grad = G.LuUnpackGrad(L_grad_flag=l_grad_flag, U_grad_flag=u_grad_flag)
|
||||
|
||||
def construct(self, l_grad, u_grad, lu_data):
|
||||
return self.lu_unpack_grad(l_grad, u_grad, lu_data)
|
||||
|
||||
|
||||
def getl(a_lu, pivots):
|
||||
a_lu_float = a_lu.float()
|
||||
a_lu_torch = torch.tensor(a_lu_float, requires_grad=True)
|
||||
out_torch = torch.lu_unpack(a_lu_torch, pivots)
|
||||
out_torch[1].backward(gradient=out_torch[1])
|
||||
return a_lu_torch.grad
|
||||
|
||||
|
||||
def getu(a_lu, pivots):
|
||||
a_lu_float = a_lu.float()
|
||||
a_lu_torch = torch.tensor(a_lu_float, requires_grad=True)
|
||||
out_torch = torch.lu_unpack(a_lu_torch, pivots)
|
||||
out_torch[2].backward(gradient=out_torch[2])
|
||||
return a_lu_torch.grad
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
@pytest.mark.env_onecard
|
||||
def test_lu_unpack_grad_graph_float():
|
||||
"""
|
||||
Feature: LuUnpackGrad gpu TEST.
|
||||
Description: 4d - float32 test case for LuUnpackGrad
|
||||
Expectation: the result match to numpy
|
||||
"""
|
||||
a = torch.randn(3, 3, 3, 3)
|
||||
loss = 1e-5
|
||||
a_lu, pivots = a.lu()
|
||||
out = torch.lu_unpack(a_lu, pivots)
|
||||
|
||||
a_l_torch = getl(a_lu, pivots)
|
||||
a_u_torch = getu(a_lu, pivots)
|
||||
|
||||
a_l_mindspore = Tensor(out[1].numpy(), mstype.float32)
|
||||
a_u_mindspore = Tensor(out[2].numpy(), mstype.float32)
|
||||
a_lu_mindspore = Tensor(a_lu.numpy(), mstype.float32)
|
||||
|
||||
net = NetLuUnpackGrad(l_grad_flag=True, u_grad_flag=True)
|
||||
l_grad_mindspore, u_grad_mindspore = net(a_l_mindspore, a_u_mindspore, a_lu_mindspore)
|
||||
|
||||
assert np.allclose(a_l_torch, torch.tensor(l_grad_mindspore.asnumpy()), atol=loss)
|
||||
assert np.allclose(a_u_torch, torch.tensor(u_grad_mindspore.asnumpy()), atol=loss)
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
@pytest.mark.env_onecard
|
||||
def test_lu_unpack_grad_pynative_float():
|
||||
"""
|
||||
Feature: LuUnpackGrad gpu TEST.
|
||||
Description: 4d - float32 test case for LuUnpackGrad
|
||||
Expectation: the result match to numpy
|
||||
"""
|
||||
a = torch.randn(5, 4, 3, 2)
|
||||
loss = 1e-5
|
||||
a_lu, pivots = a.lu()
|
||||
out = torch.lu_unpack(a_lu, pivots)
|
||||
|
||||
a_l_torch = getl(a_lu, pivots)
|
||||
a_u_torch = getu(a_lu, pivots)
|
||||
|
||||
a_l_mindspore = Tensor(out[1].numpy(), mstype.float32)
|
||||
a_u_mindspore = Tensor(out[2].numpy(), mstype.float32)
|
||||
a_lu_mindspore = Tensor(a_lu.numpy(), mstype.float32)
|
||||
|
||||
net = NetLuUnpackGrad(l_grad_flag=True, u_grad_flag=True)
|
||||
l_grad_mindspore, u_grad_mindspore = net(a_l_mindspore, a_u_mindspore, a_lu_mindspore)
|
||||
|
||||
assert np.allclose(a_l_torch, torch.tensor(l_grad_mindspore.asnumpy()), atol=loss)
|
||||
assert np.allclose(a_u_torch, torch.tensor(u_grad_mindspore.asnumpy()), atol=loss)
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
@pytest.mark.env_onecard
|
||||
def test_lu_unpack_grad_pynative_float_error():
|
||||
"""
|
||||
Feature: LuUnpackGrad gpu TEST.
|
||||
Description: 4d - uint16 test the unsupported type for LuUnpackGrad
|
||||
Expectation: the result match to numpy
|
||||
"""
|
||||
a = torch.randn(8, 7, 4, 2)
|
||||
a_lu, pivots = a.lu()
|
||||
out = torch.lu_unpack(a_lu, pivots)
|
||||
|
||||
a_l_mindspore = Tensor(out[1].numpy(), mstype.uint16)
|
||||
a_u_mindspore = Tensor(out[2].numpy(), mstype.uint16)
|
||||
a_lu_mindspore = Tensor(a_lu.numpy(), mstype.uint16)
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
net = NetLuUnpackGrad(l_grad_flag=True, u_grad_flag=True)
|
||||
net(a_l_mindspore, a_u_mindspore, a_lu_mindspore)
|
|
@ -0,0 +1,113 @@
|
|||
# Copyright 2022 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
import numpy as np
|
||||
import mindspore.nn as nn
|
||||
from mindspore import Tensor
|
||||
from mindspore.ops.operations.math_ops import LuUnpack
|
||||
import mindspore.common.dtype as mstype
|
||||
import torch
|
||||
import pytest
|
||||
|
||||
|
||||
class LuUnpackNet(nn.Cell):
|
||||
def __init__(self, unpack_data=True, unpack_pivots=True):
|
||||
super(LuUnpackNet, self).__init__()
|
||||
self.lu_unpack = LuUnpack(unpack_data=unpack_data, unpack_pivots=unpack_pivots)
|
||||
|
||||
def construct(self, LU_data, LU_pivots):
|
||||
return self.lu_unpack(LU_data, LU_pivots)
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
@pytest.mark.env_onecard
|
||||
def test_lu_unpack_float32_int64():
|
||||
"""
|
||||
Feature: ALL To ALL
|
||||
Description: test cases for LuUnpack
|
||||
Expectation: the result match to torch
|
||||
"""
|
||||
my_atol = 1e-5
|
||||
data = torch.randn(10, 7, 8, 9)
|
||||
a_lu, pivots = data.lu()
|
||||
p, a_l, a_u = torch.lu_unpack(a_lu, pivots)
|
||||
my_a_lu = Tensor(a_lu.numpy(), mstype.float32)
|
||||
my_pivots = Tensor(pivots.numpy(), mstype.int64)
|
||||
net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
|
||||
my_p, my_a_l, my_a_u = net(my_a_lu, my_pivots)
|
||||
assert np.allclose(p.numpy(), my_p.asnumpy(), atol=my_atol)
|
||||
assert np.allclose(a_l.numpy(), my_a_l.asnumpy(), atol=my_atol)
|
||||
assert np.allclose(a_u.numpy(), my_a_u.asnumpy(), atol=my_atol)
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
@pytest.mark.env_onecard
|
||||
def test_lu_unpack_float64_int32():
|
||||
"""
|
||||
Feature: ALL To ALL
|
||||
Description: test cases for LuUnpack
|
||||
Expectation: the result match to torch
|
||||
"""
|
||||
my_atol = 1e-5
|
||||
data = torch.randn(7, 8, 9, 10)
|
||||
a_lu, pivots = data.lu()
|
||||
p, a_l, a_u = torch.lu_unpack(a_lu, pivots)
|
||||
my_a_lu = Tensor(a_lu.numpy(), mstype.float64)
|
||||
my_pivots = Tensor(pivots.numpy(), mstype.int32)
|
||||
net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
|
||||
my_p, my_a_l, my_a_u = net(my_a_lu, my_pivots)
|
||||
assert np.allclose(p.numpy(), my_p.asnumpy(), atol=my_atol)
|
||||
assert np.allclose(a_l.numpy(), my_a_l.asnumpy(), atol=my_atol)
|
||||
assert np.allclose(a_u.numpy(), my_a_u.asnumpy(), atol=my_atol)
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
@pytest.mark.env_onecard
|
||||
def test_lu_unpack_float64_int16():
|
||||
"""
|
||||
Feature: ALL To ALL
|
||||
Description: test cases for LuUnpack
|
||||
Expectation: the result match to torch
|
||||
"""
|
||||
my_atol = 1e-5
|
||||
data = torch.randn(4, 6, 8, 10)
|
||||
a_lu, pivots = data.lu()
|
||||
p, a_l, a_u = torch.lu_unpack(a_lu, pivots)
|
||||
my_a_lu = Tensor(a_lu.numpy(), mstype.float64)
|
||||
my_pivots = Tensor(pivots.numpy(), mstype.int16)
|
||||
net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
|
||||
my_p, my_a_l, my_a_u = net(my_a_lu, my_pivots)
|
||||
assert np.allclose(p.numpy(), my_p.asnumpy(), atol=my_atol)
|
||||
assert np.allclose(a_l.numpy(), my_a_l.asnumpy(), atol=my_atol)
|
||||
assert np.allclose(a_u.numpy(), my_a_u.asnumpy(), atol=my_atol)
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
@pytest.mark.env_onecard
|
||||
def test_lu_unpack_input_error():
|
||||
"""
|
||||
Feature: ALL To ALL
|
||||
Description: test cases for LuUnpack
|
||||
Expectation: raise ValueError
|
||||
"""
|
||||
my_a_lu = Tensor(np.random.randn(3, 3, 2, 2).astype(np.float32))
|
||||
my_pivots = Tensor(np.random.randn(3, 3, 5, 3).astype(np.int32))
|
||||
with pytest.raises(ValueError):
|
||||
net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
|
||||
net(my_a_lu, my_pivots)
|
Loading…
Reference in New Issue