[feat][assistant][I5EWLH] add new gpu operator LuUnpackGrad

This commit is contained in:
xie-zhongjie 2022-12-01 21:46:11 +08:00
parent 5d292df0bb
commit 1caf3bbbfc
13 changed files with 1822 additions and 7 deletions

View File

@ -0,0 +1,284 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lu_unpack_grad_impl.cuh"
template <typename T>
__global__ void TrilExpendWidth(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width) {
for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
int64_t matrix_size = lu_data_height * lu_data_width;
int64_t idx_height = pos % matrix_size / lu_data_width;
int64_t idx_width = pos % matrix_size % lu_data_width;
if (idx_width >= idx_height) {
l_grad_output[pos] = 0;
} else {
int64_t in_pos = ((pos / matrix_size) * matrix_L_height + idx_height) * matrix_L_width + idx_width;
l_grad_output[pos] = l_grad_input[in_pos];
}
}
return;
}
template <typename T>
__global__ void TrilLower(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width) {
for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
int64_t matrix_size = lu_data_height * lu_data_width;
int64_t idx_height = pos % matrix_size / lu_data_width;
int64_t idx_width = pos % matrix_size % lu_data_width;
if (idx_width >= idx_height) {
l_grad_output[pos] = 0;
} else {
l_grad_output[pos] = l_grad_input[pos];
}
}
return;
}
template <typename T>
__global__ void TriuExpendHeight(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width) {
for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
int64_t matrix_size = lu_data_height * lu_data_width;
int64_t idx_height = pos % matrix_size / lu_data_width;
int64_t idx_width = pos % matrix_size % lu_data_width;
if (idx_width >= idx_height) {
int64_t in_pos = ((pos / matrix_size) * matrix_U_height + idx_height) * matrix_U_width + idx_width;
u_grad_output[pos] = u_grad_input[in_pos];
} else {
u_grad_output[pos] = 0;
}
}
return;
}
template <typename T>
__global__ void TriuUpper(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width) {
for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
int64_t matrix_size = lu_data_height * lu_data_width;
int64_t idx_height = pos % matrix_size / lu_data_width;
int64_t idx_width = pos % matrix_size % lu_data_width;
if (idx_height <= idx_width) {
u_grad_output[pos] = u_grad_input[pos];
} else {
u_grad_output[pos] = 0;
}
}
return;
}
template <typename T>
void CalTrilExpendWidth(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream) {
TrilExpendWidth<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
size, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output, lu_data_height, lu_data_width);
return;
}
template <typename T>
void CalTrilLower(const int64_t size, T *l_grad_input, const int64_t matrix_L_height, const int64_t matrix_L_width,
T *l_grad_output, const int64_t lu_data_height, const int64_t lu_data_width,
const uint32_t &device_id, cudaStream_t cuda_stream) {
TrilLower<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
size, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output, lu_data_height, lu_data_width);
return;
}
template <typename T>
void CalTriuExpendHeight(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream) {
TriuExpendHeight<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
size, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output, lu_data_height, lu_data_width);
return;
}
template <typename T>
void CalTriuUpper(const int64_t size, T *u_grad_input, const int64_t matrix_U_height, const int64_t matrix_U_width,
T *u_grad_output, const int64_t lu_data_height, const int64_t lu_data_width,
const uint32_t &device_id, cudaStream_t cuda_stream) {
TriuUpper<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
size, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output, lu_data_height, lu_data_width);
return;
}
template CUDA_LIB_EXPORT void CalTrilExpendWidth<double>(const int64_t size, double *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
double *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilExpendWidth<float>(const int64_t size, float *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
float *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilExpendWidth<half>(const int64_t size, half *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
half *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilExpendWidth<int64_t>(const int64_t size, int64_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
int64_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilExpendWidth<int32_t>(const int64_t size, int32_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
int32_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilExpendWidth<int16_t>(const int64_t size, int16_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
int16_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilExpendWidth<int8_t>(const int64_t size, int8_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
int8_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilExpendWidth<uint8_t>(const int64_t size, uint8_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
uint8_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilLower<double>(const int64_t size, double *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
double *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilLower<float>(const int64_t size, float *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
float *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilLower<half>(const int64_t size, half *l_grad_input, const int64_t matrix_L_height,
const int64_t matrix_L_width, half *l_grad_output,
const int64_t lu_data_height, const int64_t lu_data_width,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilLower<int64_t>(const int64_t size, int64_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
int64_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilLower<int32_t>(const int64_t size, int32_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
int32_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilLower<int16_t>(const int64_t size, int16_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
int16_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilLower<int8_t>(const int64_t size, int8_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
int8_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilLower<uint8_t>(const int64_t size, uint8_t *l_grad_input,
const int64_t matrix_L_height, const int64_t matrix_L_width,
uint8_t *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuExpendHeight<double>(const int64_t size, double *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
double *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuExpendHeight<float>(const int64_t size, float *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
float *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuExpendHeight<half>(const int64_t size, half *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
half *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuExpendHeight<int64_t>(const int64_t size, int64_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
int64_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuExpendHeight<int32_t>(const int64_t size, int32_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
int32_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuExpendHeight<int16_t>(const int64_t size, int16_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
int16_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuExpendHeight<int8_t>(const int64_t size, int8_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
int8_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuExpendHeight<uint8_t>(const int64_t size, uint8_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
uint8_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuUpper<double>(const int64_t size, double *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
double *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuUpper<float>(const int64_t size, float *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
float *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuUpper<half>(const int64_t size, half *u_grad_input, const int64_t matrix_U_height,
const int64_t matrix_U_width, half *u_grad_output,
const int64_t lu_data_height, const int64_t lu_data_width,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuUpper<int64_t>(const int64_t size, int64_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
int64_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuUpper<int32_t>(const int64_t size, int32_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
int32_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuUpper<int16_t>(const int64_t size, int16_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
int16_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuUpper<int8_t>(const int64_t size, int8_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
int8_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuUpper<uint8_t>(const int64_t size, uint8_t *u_grad_input,
const int64_t matrix_U_height, const int64_t matrix_U_width,
uint8_t *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);

View File

@ -0,0 +1,42 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LU_UNPACK_GRAD_IMPL_CUH_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LU_UNPACK_GRAD_IMPL_CUH_
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
template <typename T>
CUDA_LIB_EXPORT void CalTrilExpendWidth(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template <typename T>
CUDA_LIB_EXPORT void CalTrilLower(const int64_t size, T *l_grad_input, const int64_t matrix_L_height,
const int64_t matrix_L_width, T *l_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream);
template <typename T>
CUDA_LIB_EXPORT void CalTriuExpendHeight(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id,
cudaStream_t cuda_stream);
template <typename T>
CUDA_LIB_EXPORT void CalTriuUpper(const int64_t size, T *u_grad_input, const int64_t matrix_U_height,
const int64_t matrix_U_width, T *u_grad_output, const int64_t lu_data_height,
const int64_t lu_data_width, const uint32_t &device_id, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LU_UNPACK_GRAD_IMPL_CUH_

View File

@ -0,0 +1,369 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cuh"
template <typename S>
__global__ void InitOrder(const size_t p_size, const int64_t lu_data_dim1, S *final_order) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < p_size * lu_data_dim1; pos += gridDim.x * blockDim.x) {
final_order[pos] = pos % lu_data_dim1;
}
}
template <typename S>
__global__ void SwapOrder(const size_t p_size, const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
S *lu_pivots_ptr, S *final_order) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < p_size; pos += gridDim.x * blockDim.x) {
S *lu_pivots_working_ptr = lu_pivots_ptr + pos * lu_pivots_dim;
for (S idx = 0; idx < lu_pivots_dim; ++idx) {
S perm_pivots_id = lu_pivots_working_ptr[idx] - 1;
S tmp = final_order[pos * lu_data_dim1 + idx];
final_order[pos * lu_data_dim1 + idx] = final_order[pos * lu_data_dim1 + perm_pivots_id];
final_order[pos * lu_data_dim1 + perm_pivots_id] = tmp;
}
}
}
template <typename T, typename S>
__global__ void AssignEyeValue(const size_t p_size, const size_t lu_data_dim1, S *final_order, T *pivots_ptr) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < p_size * lu_data_dim1; pos += gridDim.x * blockDim.x) {
size_t row_start = (pos / lu_data_dim1 * lu_data_dim1 + final_order[pos]) * lu_data_dim1;
size_t eye_idx = pos % lu_data_dim1;
*(pivots_ptr + row_start + eye_idx) = static_cast<T>(1);
}
return;
}
template <typename T, typename S>
void TransposeEyeMatrix(S *lu_pivots_ptr, T *pivots_ptr, S *final_order, const int64_t batch_num,
const int64_t lu_data_dim1, const int64_t lu_pivots_dim, const uint32_t &device_id,
cudaStream_t cuda_stream) {
cudaMemset(pivots_ptr, 0, batch_num * lu_data_dim1 * lu_data_dim1 * sizeof(T));
InitOrder<<<CUDA_BLOCKS(device_id, batch_num * lu_data_dim1), CUDA_THREADS(device_id), 0, cuda_stream>>>(
batch_num, lu_data_dim1, final_order);
SwapOrder<<<CUDA_BLOCKS(device_id, batch_num), CUDA_THREADS(device_id), 0, cuda_stream>>>(
batch_num, lu_data_dim1, lu_pivots_dim, lu_pivots_ptr, final_order);
AssignEyeValue<<<CUDA_BLOCKS(device_id, batch_num * lu_data_dim1), CUDA_THREADS(device_id), 0, cuda_stream>>>(
batch_num, lu_data_dim1, final_order, pivots_ptr);
return;
}
template <typename T>
__global__ void TriuAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
const int64_t lu_row, const int64_t lu_col, T *output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
int64_t out_matrix_size = out_row * out_col;
int64_t in_matrix_size = lu_row * lu_col;
int64_t idx_row = pos % out_matrix_size / out_col;
int64_t idx_col = pos % out_matrix_size % out_col;
if (idx_row <= idx_col) {
int64_t batch_size = pos / out_matrix_size;
int64_t in_pos = batch_size * in_matrix_size + idx_row * lu_col + idx_col;
output[pos] = input[in_pos];
} else {
output[pos] = static_cast<T>(0.0);
}
}
return;
}
template <typename T>
void CalTriuAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col, const int64_t lu_row,
const int64_t lu_col, T *output, const uint32_t &device_id, cudaStream_t cuda_stream) {
TriuAux<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(size, input, out_row, out_col,
lu_row, lu_col, output);
return;
}
template <typename T>
__global__ void TrilAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
const int64_t lu_row, const int64_t lu_col, T *output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
int64_t out_matrix_size = out_row * out_col;
int64_t in_matrix_size = lu_row * lu_col;
int64_t idx_row = pos % out_matrix_size / out_col;
int64_t idx_col = pos % out_matrix_size % out_col;
if (idx_row == idx_col) {
output[pos] = static_cast<T>(1.0);
} else if (idx_row > idx_col) {
int64_t batch_size = pos / out_matrix_size;
int64_t in_pos = batch_size * in_matrix_size + idx_row * lu_col + idx_col;
output[pos] = input[in_pos];
} else {
output[pos] = static_cast<T>(0.0);
}
}
return;
}
template <typename T>
void CalTrilAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col, const int64_t lu_row,
const int64_t lu_col, T *output, const uint32_t &device_id, cudaStream_t cuda_stream) {
TrilAux<<<CUDA_BLOCKS(device_id, size), CUDA_THREADS(device_id), 0, cuda_stream>>>(size, input, out_row, out_col,
lu_row, lu_col, output);
return;
}
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int64_t>(int64_t *lu_pivots_ptr, double *pivots_ptr,
int64_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int32_t>(int32_t *lu_pivots_ptr, double *pivots_ptr,
int32_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int16_t>(int16_t *lu_pivots_ptr, double *pivots_ptr,
int16_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, int8_t>(int8_t *lu_pivots_ptr, double *pivots_ptr,
int8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<double, uint8_t>(uint8_t *lu_pivots_ptr, double *pivots_ptr,
uint8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int64_t>(int64_t *lu_pivots_ptr, float *pivots_ptr,
int64_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int32_t>(int32_t *lu_pivots_ptr, float *pivots_ptr,
int32_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int16_t>(int16_t *lu_pivots_ptr, float *pivots_ptr,
int16_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, int8_t>(int8_t *lu_pivots_ptr, float *pivots_ptr,
int8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<float, uint8_t>(uint8_t *lu_pivots_ptr, float *pivots_ptr,
uint8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int64_t>(int64_t *lu_pivots_ptr, half *pivots_ptr,
int64_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int32_t>(int32_t *lu_pivots_ptr, half *pivots_ptr,
int32_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int16_t>(int16_t *lu_pivots_ptr, half *pivots_ptr,
int16_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, int8_t>(int8_t *lu_pivots_ptr, half *pivots_ptr,
int8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<half, uint8_t>(uint8_t *lu_pivots_ptr, half *pivots_ptr,
uint8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int64_t>(int64_t *lu_pivots_ptr, int64_t *pivots_ptr,
int64_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int32_t>(int32_t *lu_pivots_ptr, int64_t *pivots_ptr,
int32_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int16_t>(int16_t *lu_pivots_ptr, int64_t *pivots_ptr,
int16_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, int8_t>(int8_t *lu_pivots_ptr, int64_t *pivots_ptr,
int8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int64_t, uint8_t>(uint8_t *lu_pivots_ptr, int64_t *pivots_ptr,
uint8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int64_t>(int64_t *lu_pivots_ptr, int32_t *pivots_ptr,
int64_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int32_t>(int32_t *lu_pivots_ptr, int32_t *pivots_ptr,
int32_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int16_t>(int16_t *lu_pivots_ptr, int32_t *pivots_ptr,
int16_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, int8_t>(int8_t *lu_pivots_ptr, int32_t *pivots_ptr,
int8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int32_t, uint8_t>(uint8_t *lu_pivots_ptr, int32_t *pivots_ptr,
uint8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int64_t>(int64_t *lu_pivots_ptr, int16_t *pivots_ptr,
int64_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int32_t>(int32_t *lu_pivots_ptr, int16_t *pivots_ptr,
int32_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int16_t>(int16_t *lu_pivots_ptr, int16_t *pivots_ptr,
int16_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, int8_t>(int8_t *lu_pivots_ptr, int16_t *pivots_ptr,
int8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int16_t, uint8_t>(uint8_t *lu_pivots_ptr, int16_t *pivots_ptr,
uint8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int64_t>(int64_t *lu_pivots_ptr, int8_t *pivots_ptr,
int64_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int32_t>(int32_t *lu_pivots_ptr, int8_t *pivots_ptr,
int32_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int16_t>(int16_t *lu_pivots_ptr, int8_t *pivots_ptr,
int16_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, int8_t>(int8_t *lu_pivots_ptr, int8_t *pivots_ptr,
int8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void TransposeEyeMatrix<int8_t, uint8_t>(uint8_t *lu_pivots_ptr, int8_t *pivots_ptr,
uint8_t *final_order, const int64_t batch_num,
const int64_t lu_data_dim1,
const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<uint8_t>(const size_t size, const uint8_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
uint8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<uint16_t>(const size_t size, const uint16_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
uint16_t *output, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<uint32_t>(const size_t size, const uint32_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
uint32_t *output, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<uint64_t>(const size_t size, const uint64_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
uint64_t *output, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<int8_t>(const size_t size, const int8_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
int8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<int16_t>(const size_t size, const int16_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
int16_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<int>(const size_t size, const int *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
int *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<int64_t>(const size_t size, const int64_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
int64_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<half>(const size_t size, const half *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
half *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<float>(const size_t size, const float *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
float *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<double>(const size_t size, const double *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
double *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTriuAux<bool>(const size_t size, const bool *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
bool *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<uint8_t>(const size_t size, const uint8_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
uint8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<uint16_t>(const size_t size, const uint16_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
uint16_t *output, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<uint32_t>(const size_t size, const uint32_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
uint32_t *output, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<uint64_t>(const size_t size, const uint64_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
uint64_t *output, const uint32_t &device_id,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<int8_t>(const size_t size, const int8_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
int8_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<int16_t>(const size_t size, const int16_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
int16_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<int>(const size_t size, const int *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
int *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<int64_t>(const size_t size, const int64_t *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
int64_t *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<half>(const size_t size, const half *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
half *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<float>(const size_t size, const float *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
float *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<double>(const size_t size, const double *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
double *output, const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalTrilAux<bool>(const size_t size, const bool *input, const int64_t out_row,
const int64_t out_col, const int64_t lu_row, const int64_t lu_col,
bool *output, const uint32_t &device_id, cudaStream_t cuda_stream);

View File

@ -0,0 +1,35 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LUUNPACK_IMPL_CUH_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LUUNPACK_IMPL_CUH_
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.h"
template <typename T, typename S>
CUDA_LIB_EXPORT void TransposeEyeMatrix(S *lu_pivots_ptr, T *pivots_ptr, S *final_order, const int64_t batch_num,
const int64_t lu_data_dim1, const int64_t lu_pivots_dim,
const uint32_t &device_id, cudaStream_t cuda_stream);
template <typename T>
CUDA_LIB_EXPORT void CalTrilAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
const int64_t lu_row, const int64_t lu_col, T *output, const uint32_t &device_id,
cudaStream_t cuda_stream);
template <typename T>
CUDA_LIB_EXPORT void CalTriuAux(const size_t size, const T *input, const int64_t out_row, const int64_t out_col,
const int64_t lu_row, const int64_t lu_col, T *output, const uint32_t &device_id,
cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LUUNPACK_IMPL_CUH_

View File

@ -0,0 +1,455 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/gpu/kernel/math/lu_unpack_gpu_kernel.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kFirstDim = 1;
constexpr size_t kSecondDim = 2;
} // namespace
void LuUnpackGpuKernelMod::ResetResource() noexcept {
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}
template <typename T, typename S>
bool LuUnpackGpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
T *lu_data_ptr = GetDeviceAddress<T>(inputs, kIndex0);
MS_EXCEPTION_IF_NULL(lu_data_ptr);
S *lu_pivots_ptr = GetDeviceAddress<S>(inputs, kIndex1);
MS_EXCEPTION_IF_NULL(lu_pivots_ptr);
T *pivots_ptr = GetDeviceAddress<T>(outputs, kIndex0);
T *l_ptr = GetDeviceAddress<T>(outputs, kIndex1);
T *u_ptr = GetDeviceAddress<T>(outputs, kIndex2);
S *final_order = GetDeviceAddress<S>(workspace, kIndex0);
// triu and tril
if (lu_data_dim1_ > lu_data_dim2_) {
CalTriuAux(batch_num_ * u_stride_, lu_data_ptr, lu_data_dim2_, lu_data_dim2_, lu_data_dim1_, lu_data_dim2_, u_ptr,
device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
CalTrilAux(batch_num_ * l_stride_, lu_data_ptr, lu_data_dim1_, lu_data_dim2_, lu_data_dim1_, lu_data_dim2_, l_ptr,
device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
} else {
CalTriuAux(batch_num_ * u_stride_, lu_data_ptr, lu_data_dim1_, lu_data_dim2_, lu_data_dim1_, lu_data_dim2_, u_ptr,
device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
CalTrilAux(batch_num_ * l_stride_, lu_data_ptr, lu_data_dim1_, lu_data_dim1_, lu_data_dim1_, lu_data_dim2_, l_ptr,
device_id_, reinterpret_cast<cudaStream_t>(cuda_stream_));
}
// swap and index_select
TransposeEyeMatrix(lu_pivots_ptr, pivots_ptr, final_order, batch_num_, lu_data_dim1_, lu_pivots_dim_, device_id_,
reinterpret_cast<cudaStream_t>(cuda_stream_));
return true;
}
bool LuUnpackGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
kernel_name_ = base_operator->name();
if (inputs.empty() || outputs.empty()) {
MS_LOG(ERROR) << "For '" << kernel_name_ << "' got empty inputs or outputs, which is invalid.";
return false;
}
auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
if (!is_match) {
MS_LOG(EXCEPTION) << "LuUnpack does not support this kernel data type: " << kernel_attr << ".";
return false;
}
kernel_func_ = func_list_[index].second;
unit_size1_ = abstract::TypeIdSize(inputs.at(kIndex0)->GetDtype());
unit_size2_ = abstract::TypeIdSize(inputs.at(kIndex1)->GetDtype());
return true;
}
int LuUnpackGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &) {
int ret = KernelMod::Resize(base_operator, inputs, outputs);
if (ret != KRET_OK) {
return ret;
}
std::vector<int64_t> lu_data_shape = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
std::vector<int64_t> lu_pivots_shape = std::vector<int64_t>(inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(),
inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
lu_data_size_ = size_t(std::accumulate(lu_data_shape.begin(), lu_data_shape.end(), 1, std::multiplies<int64_t>()));
lu_pivots_size_ =
size_t(std::accumulate(lu_pivots_shape.begin(), lu_pivots_shape.end(), 1, std::multiplies<int64_t>()));
auto lu_data_shape_vec = inputs[kIndex0]->GetShapeVector();
auto lu_pivots_shape_vec = inputs[kIndex1]->GetShapeVector();
size_t lu_data_dims = lu_data_shape_vec.size();
size_t lu_pivots_dims = lu_pivots_shape_vec.size();
std::vector<int64_t> lu_data_dims_vector = lu_data_shape_vec;
std::vector<int64_t> lu_pivots_dims_vector = lu_pivots_shape_vec;
lu_data_dim1_ = lu_data_dims_vector[lu_data_dims - kSecondDim];
lu_data_dim2_ = lu_data_dims_vector[lu_data_dims - kFirstDim];
if (lu_data_dim1_ > lu_data_dim2_) {
l_stride_ = lu_data_dim1_ * lu_data_dim2_;
u_stride_ = lu_data_dim2_ * lu_data_dim2_;
} else {
l_stride_ = lu_data_dim1_ * lu_data_dim1_;
u_stride_ = lu_data_dim1_ * lu_data_dim2_;
}
lu_pivots_dim_ = lu_pivots_dims_vector[lu_pivots_dims - kFirstDim];
batch_num_ = lu_data_size_ / (lu_data_dim1_ * lu_data_dim2_);
workspace_size_list_.push_back(batch_num_ * lu_data_dim1_ * unit_size2_);
return KRET_OK;
}
bool LuUnpackGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
cuda_stream_ = stream_ptr;
return kernel_func_(this, inputs, workspace, outputs);
}
std::vector<KernelAttr> LuUnpackGpuKernelMod::GetOpSupport() {
std::vector<KernelAttr> support_list;
(void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
[](const std::pair<KernelAttr, LuUnpackFunc> &pair) { return pair.first; });
return support_list;
}
std::vector<std::pair<KernelAttr, LuUnpackGpuKernelMod::LuUnpackFunc>> LuUnpackGpuKernelMod::func_list_ = {
{KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64),
&LuUnpackGpuKernelMod::LaunchKernel<double, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64),
&LuUnpackGpuKernelMod::LaunchKernel<double, int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64),
&LuUnpackGpuKernelMod::LaunchKernel<double, int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64),
&LuUnpackGpuKernelMod::LaunchKernel<double, int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64),
&LuUnpackGpuKernelMod::LaunchKernel<double, uint8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
&LuUnpackGpuKernelMod::LaunchKernel<float, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
&LuUnpackGpuKernelMod::LaunchKernel<float, int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
&LuUnpackGpuKernelMod::LaunchKernel<float, int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
&LuUnpackGpuKernelMod::LaunchKernel<float, int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
&LuUnpackGpuKernelMod::LaunchKernel<float, uint8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
&LuUnpackGpuKernelMod::LaunchKernel<half, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
&LuUnpackGpuKernelMod::LaunchKernel<half, int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
&LuUnpackGpuKernelMod::LaunchKernel<half, int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
&LuUnpackGpuKernelMod::LaunchKernel<half, int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
&LuUnpackGpuKernelMod::LaunchKernel<half, uint8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64),
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64),
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64),
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64),
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64),
&LuUnpackGpuKernelMod::LaunchKernel<int64_t, uint8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, uint8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGpuKernelMod::LaunchKernel<int32_t, uint8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt16)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16),
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt16)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16),
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt16)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16),
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt16)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16),
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt16)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16),
&LuUnpackGpuKernelMod::LaunchKernel<int16_t, uint8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, uint8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
&LuUnpackGpuKernelMod::LaunchKernel<int8_t, uint8_t>}};
MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, LuUnpack, LuUnpackGpuKernelMod);
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,78 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LUUNPACK_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LUUNPACK_GPU_KERNEL_H_
#include <map>
#include <vector>
#include <string>
#include <memory>
#include <utility>
#include <algorithm>
#include <functional>
#include "mindspore/core/ops/lu_unpack.h"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/factory/ms_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_impl.cuh"
#include "plugin/device/gpu/hal/device/gpu_device_address.h"
#include "utils/check_convert_utils.h"
#include "abstract/ops/primitive_infer_map.h"
namespace mindspore {
constexpr size_t kInputNum = 2;
constexpr size_t kOutputNum = 3;
namespace kernel {
class LuUnpackGpuKernelMod : public NativeGpuKernelMod {
public:
LuUnpackGpuKernelMod() { ResetResource(); }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override;
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;
protected:
std::vector<KernelAttr> GetOpSupport() override;
private:
using LuUnpackFunc =
std::function<bool(LuUnpackGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
LuUnpackFunc kernel_func_;
void *cuda_stream_{nullptr};
static std::vector<std::pair<KernelAttr, LuUnpackFunc>> func_list_;
size_t lu_data_size_;
size_t lu_pivots_size_;
int64_t lu_data_dim1_;
int64_t lu_data_dim2_;
int64_t l_stride_;
int64_t u_stride_;
int64_t lu_pivots_dim_;
int64_t batch_num_;
size_t unit_size1_;
size_t unit_size2_;
void ResetResource() noexcept;
template <typename T_data, typename T_pivots>
bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs);
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LUUNPACK_GPU_KERNEL_H_

View File

@ -0,0 +1,213 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/gpu/kernel/math/lu_unpack_grad_gpu_kernel.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr const size_t kLuUnpackGradInputNum = 3;
constexpr const size_t kLuUnpackGradOutputNum = 2;
constexpr const int64_t Min_Dim = 2;
} // namespace
bool LuUnpackGradGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
auto kernel_ptr = std::dynamic_pointer_cast<ops::LuUnpackGrad>(base_operator);
MS_ERROR_IF_NULL_W_RET_VAL(kernel_ptr, false);
kernel_name_ = kernel_ptr->name();
if (inputs.size() != kLuUnpackGradInputNum || outputs.size() != kLuUnpackGradOutputNum) {
MS_LOG(ERROR) << "For '" << kernel_name_ << "', input and output size must be " << kLuUnpackGradInputNum << " and "
<< kLuUnpackGradOutputNum << ", but got " << inputs.size() << " and " << outputs.size();
return false;
}
auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
auto pair = MatchKernelAttr(kernel_attr, GetOpSupport());
if (!pair.first) {
MS_LOG(ERROR) << "'" << kernel_name_ << "' does not support this kernel data type: " << kernel_attr;
return false;
}
kernel_func_ = func_list_[pair.second].second;
unit_size_ = abstract::TypeIdSize(inputs.at(kIndex0)->GetDtype());
input_L_shape = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
input_U_shape = std::vector<int64_t>(inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(),
inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
input_LU_shape = std::vector<int64_t>(inputs.at(kIndex2)->GetDeviceShapeAdaptively().begin(),
inputs.at(kIndex2)->GetDeviceShapeAdaptively().end());
return true;
}
int LuUnpackGradGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &others) {
int ret = KernelMod::Resize(base_operator, inputs, outputs);
if (ret != KRET_OK) {
return ret;
}
ResetResource();
input_L_shape = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
input_U_shape = std::vector<int64_t>(inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(),
inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
input_LU_shape = std::vector<int64_t>(inputs.at(kIndex2)->GetDeviceShapeAdaptively().begin(),
inputs.at(kIndex2)->GetDeviceShapeAdaptively().end());
int64_t input_L_elements_ =
std::accumulate(input_L_shape.begin(), input_L_shape.end(), int64_t(1), std::multiplies<int64_t>());
int64_t input_U_elements_ =
std::accumulate(input_U_shape.begin(), input_U_shape.end(), int64_t(1), std::multiplies<int64_t>());
int64_t input_LU_elements_ =
std::accumulate(input_LU_shape.begin(), input_LU_shape.end(), int64_t(1), std::multiplies<int64_t>());
input_elements_ = input_L_elements_ + input_U_elements_ + input_LU_elements_;
int64_t input_L_size = input_L_elements_ * unit_size_;
int64_t input_U_size = input_U_elements_ * unit_size_;
int64_t input_LU_size = input_LU_elements_ * unit_size_;
input_size_list_.emplace_back(input_L_size);
input_size_list_.emplace_back(input_U_size);
input_size_list_.emplace_back(input_LU_size);
output_size_list_.emplace_back(input_LU_size);
output_size_list_.emplace_back(input_LU_size);
workspace_size_list_.emplace_back(input_L_size);
workspace_size_list_.emplace_back(input_U_size);
return KRET_OK;
}
void LuUnpackGradGpuKernelMod::ResetResource() noexcept {
input_elements_ = 0;
is_null_input_ = false;
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}
template <typename T>
bool LuUnpackGradGpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
T *l_grad_input = GetDeviceAddress<T>(inputs, kIndex0);
T *u_grad_input = GetDeviceAddress<T>(inputs, kIndex1);
T *l_grad_output = GetDeviceAddress<T>(outputs, kIndex0);
T *u_grad_output = GetDeviceAddress<T>(outputs, kIndex1);
int64_t lu_data_dims = input_LU_shape.size();
int64_t lu_data_elem_num =
std::accumulate(input_LU_shape.begin(), input_LU_shape.end(), int64_t(1), std::multiplies<int64_t>());
int64_t lu_data_height = input_LU_shape[lu_data_dims - 2];
int64_t lu_data_width = input_LU_shape[lu_data_dims - 1];
int64_t lu_data_stride = lu_data_height * lu_data_width;
int64_t matrix_num = lu_data_elem_num / lu_data_stride;
int64_t output_stride = lu_data_stride;
int64_t input_L_dims = input_L_shape.size();
int64_t input_U_dims = input_U_shape.size();
int64_t matrix_L_width = input_L_shape[input_L_dims - 2];
int64_t matrix_L_height = input_L_shape[input_L_dims - 1];
int64_t matrix_U_width = input_U_shape[input_U_dims - 2];
int64_t matrix_U_height = input_U_shape[input_U_dims - 1];
if (lu_data_width > lu_data_height) {
CalTrilExpendWidth(matrix_num * output_stride, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output,
lu_data_height, lu_data_width, device_id_, stream_ptr_);
CalTriuUpper(matrix_num * output_stride, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output,
lu_data_height, lu_data_width, device_id_, stream_ptr_);
} else {
if (lu_data_height > lu_data_width) {
CalTrilLower(matrix_num * output_stride, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output,
lu_data_height, lu_data_width, device_id_, stream_ptr_);
CalTriuExpendHeight(matrix_num * output_stride, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output,
lu_data_height, lu_data_width, device_id_, stream_ptr_);
} else {
CalTrilLower(matrix_num * output_stride, l_grad_input, matrix_L_height, matrix_L_width, l_grad_output,
lu_data_height, lu_data_width, device_id_, stream_ptr_);
CalTriuUpper(matrix_num * output_stride, u_grad_input, matrix_U_height, matrix_U_width, u_grad_output,
lu_data_height, lu_data_width, device_id_, stream_ptr_);
}
}
return true;
}
std::vector<std::pair<KernelAttr, LuUnpackGradGpuKernelMod::LuUnpackGradFunc>> LuUnpackGradGpuKernelMod::func_list_ = {
{KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddInputAttr(kNumberTypeFloat64)
.AddInputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64),
&LuUnpackGradGpuKernelMod::LaunchKernel<double>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
&LuUnpackGradGpuKernelMod::LaunchKernel<float>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
&LuUnpackGradGpuKernelMod::LaunchKernel<half>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64),
&LuUnpackGradGpuKernelMod::LaunchKernel<int64_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
&LuUnpackGradGpuKernelMod::LaunchKernel<int32_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt16)
.AddInputAttr(kNumberTypeInt16)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16),
&LuUnpackGradGpuKernelMod::LaunchKernel<int16_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
&LuUnpackGradGpuKernelMod::LaunchKernel<int8_t>},
{KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
&LuUnpackGradGpuKernelMod::LaunchKernel<uint8_t>}};
std::vector<KernelAttr> LuUnpackGradGpuKernelMod::GetOpSupport() {
std::vector<KernelAttr> support_list;
(void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
[](const std::pair<KernelAttr, LuUnpackGradFunc> &pair) { return pair.first; });
return support_list;
}
MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, LuUnpackGrad, LuUnpackGradGpuKernelMod);
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,81 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_UNPACK_GRAD_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_UNPACK_GRAD_GPU_KERNEL_H_
#include <vector>
#include <map>
#include <functional>
#include <algorithm>
#include <utility>
#include "mindspore/core/ops/grad/lu_unpack_grad.h"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/factory/ms_factory.h"
#include "plugin/device/gpu/hal/device/gpu_device_address.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/lu_unpack_grad_impl.cuh"
namespace mindspore {
namespace kernel {
class LuUnpackGradGpuKernelMod : public NativeGpuKernelMod {
public:
LuUnpackGradGpuKernelMod() { ResetResource(); }
~LuUnpackGradGpuKernelMod() override = default;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
if (is_null_input_) {
return true;
}
VARIABLE_NOT_USED(workspace);
stream_ptr_ = reinterpret_cast<cudaStream_t>(stream_ptr);
return kernel_func_(this, inputs, workspace, outputs);
}
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override;
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &others = std::map<uint32_t, tensor::TensorPtr>()) override;
std::vector<KernelAttr> GetOpSupport() override;
void ResetResource() noexcept;
private:
template <typename T>
bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs);
using LuUnpackGradFunc =
std::function<bool(LuUnpackGradGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
static std::vector<std::pair<KernelAttr, LuUnpackGradFunc>> func_list_;
LuUnpackGradFunc kernel_func_;
bool is_null_input_{false};
int64_t unit_size_{1};
int64_t input_elements_;
std::vector<int64_t> input_L_shape;
std::vector<int64_t> input_U_shape;
std::vector<int64_t> input_LU_shape;
cudaStream_t stream_ptr_{nullptr};
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_LU_UNPACK_GRAD_GPU_KERNEL_H_

View File

@ -45,6 +45,7 @@ abstract::TupleShapePtr LuUnpackGradInferShape(const PrimitivePtr &primitive,
auto LU_data_dim1 = LU_data_shape[LU_data_shape.size() - 2];
auto LU_data_dim2 = LU_data_shape[LU_data_shape.size() - 1];
int64_t LU_data_min = std::min(LU_data_dim1, LU_data_dim2);
if (LU_data_dim1 != L_data_dim1) {
MS_EXCEPTION(ValueError) << "For '" << primitive->name()
<< "', L_grad's data dim[-2] and LU_data's dim[-2] should be same.";

View File

@ -42,6 +42,7 @@ abstract::TupleShapePtr LuUnpackInferShape(const PrimitivePtr &primitive,
auto LU_data_shape = LU_data_shape_map[kShape];
auto LU_pivots_shape_map = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[1]->BuildShape());
auto LU_pivots_shape = LU_pivots_shape_map[kShape];
if (IsDynamicRank(LU_data_shape)) {
auto output_shpe = std::make_shared<abstract::Shape>(LU_data_shape);
return std::make_shared<abstract::TupleShape>(

View File

@ -6388,7 +6388,7 @@ class LuUnpack(Primitive):
RuntimeError: On the Ascend platform, if the value of `LU_pivots` are out of range[1, LU_data.shape[-2]).
Supported Platforms:
``Ascend`` ``CPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> LU_data = Tensor(np.array([[[-0.3806, -0.4872, 0.5536],
@ -6399,11 +6399,32 @@ class LuUnpack(Primitive):
... [ 0.1015, -0.5363, 0.6165]]]), mstype.float32)
>>> LU_pivots = Tensor(np.array([[1, 3, 3],
... [2, 3, 3]]), mstype.int32)
>>> lu_unpack = ops.LuUnpack()
>>> pivots, L, U = lu_unpack(LU_data, LU_pivots, unpack_data, unpack_pivots)
>>> lu_unpack = LuUnpack()
>>> pivots, L, U = lu_unpack(LU_data, LU_pivots)
>>> print(pivots)
[[[1. 0. 0.]
[0. 0. 1.]
[0. 1. 0.]]
<BLANKLINE>
[[0. 0. 1.]
[1. 0. 0.]
[0. 1. 0.]]]
>>> print(L)
[[[ 1. 0. 0. ]
[-0.1287 1. 0. ]
[ 0.2583 0.5239 1. ]]
<BLANKLINE>
[[ 1. 0. 0. ]
[-0.6401 1. 0. ]
[ 0.1015 -0.5363 1. ]]]
>>> print(U)
[[[-0.3806 -0.4872 0.5536]
[ 0. 0.6508 -0.2396]
[ 0. 0. 0.6902]]
<BLANKLINE>
[[ 0.6706 -1.1782 0.4574]
[ 0. -0.4779 0.6701]
[ 0. 0. 0.6165]]]
"""
@prim_attr_register
@ -6491,10 +6512,6 @@ class Polygamma(Primitive):
.. math::
\psi^{(a)}(x) = \frac{d^{(a)}}{dx^{(a)}} \psi(x)
Args:
:math:`a \geq 0`: the order of the polygamma function.
input (Tensor): the tensor to compute the polygamma function.
Inputs:
- **a** (Tensor) - The order of the polygamma function, types: int32, int64.
- **x** (Tensor) - The input tensor, types: float16, float32, float64.

View File

@ -0,0 +1,126 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import torch
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops.operations import _grad_ops as G
import mindspore.common.dtype as mstype
class NetLuUnpackGrad(nn.Cell):
def __init__(self, l_grad_flag=True, u_grad_flag=True):
super().__init__()
self.lu_unpack_grad = G.LuUnpackGrad(L_grad_flag=l_grad_flag, U_grad_flag=u_grad_flag)
def construct(self, l_grad, u_grad, lu_data):
return self.lu_unpack_grad(l_grad, u_grad, lu_data)
def getl(a_lu, pivots):
a_lu_float = a_lu.float()
a_lu_torch = torch.tensor(a_lu_float, requires_grad=True)
out_torch = torch.lu_unpack(a_lu_torch, pivots)
out_torch[1].backward(gradient=out_torch[1])
return a_lu_torch.grad
def getu(a_lu, pivots):
a_lu_float = a_lu.float()
a_lu_torch = torch.tensor(a_lu_float, requires_grad=True)
out_torch = torch.lu_unpack(a_lu_torch, pivots)
out_torch[2].backward(gradient=out_torch[2])
return a_lu_torch.grad
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_unpack_grad_graph_float():
"""
Feature: LuUnpackGrad gpu TEST.
Description: 4d - float32 test case for LuUnpackGrad
Expectation: the result match to numpy
"""
a = torch.randn(3, 3, 3, 3)
loss = 1e-5
a_lu, pivots = a.lu()
out = torch.lu_unpack(a_lu, pivots)
a_l_torch = getl(a_lu, pivots)
a_u_torch = getu(a_lu, pivots)
a_l_mindspore = Tensor(out[1].numpy(), mstype.float32)
a_u_mindspore = Tensor(out[2].numpy(), mstype.float32)
a_lu_mindspore = Tensor(a_lu.numpy(), mstype.float32)
net = NetLuUnpackGrad(l_grad_flag=True, u_grad_flag=True)
l_grad_mindspore, u_grad_mindspore = net(a_l_mindspore, a_u_mindspore, a_lu_mindspore)
assert np.allclose(a_l_torch, torch.tensor(l_grad_mindspore.asnumpy()), atol=loss)
assert np.allclose(a_u_torch, torch.tensor(u_grad_mindspore.asnumpy()), atol=loss)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_unpack_grad_pynative_float():
"""
Feature: LuUnpackGrad gpu TEST.
Description: 4d - float32 test case for LuUnpackGrad
Expectation: the result match to numpy
"""
a = torch.randn(5, 4, 3, 2)
loss = 1e-5
a_lu, pivots = a.lu()
out = torch.lu_unpack(a_lu, pivots)
a_l_torch = getl(a_lu, pivots)
a_u_torch = getu(a_lu, pivots)
a_l_mindspore = Tensor(out[1].numpy(), mstype.float32)
a_u_mindspore = Tensor(out[2].numpy(), mstype.float32)
a_lu_mindspore = Tensor(a_lu.numpy(), mstype.float32)
net = NetLuUnpackGrad(l_grad_flag=True, u_grad_flag=True)
l_grad_mindspore, u_grad_mindspore = net(a_l_mindspore, a_u_mindspore, a_lu_mindspore)
assert np.allclose(a_l_torch, torch.tensor(l_grad_mindspore.asnumpy()), atol=loss)
assert np.allclose(a_u_torch, torch.tensor(u_grad_mindspore.asnumpy()), atol=loss)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_unpack_grad_pynative_float_error():
"""
Feature: LuUnpackGrad gpu TEST.
Description: 4d - uint16 test the unsupported type for LuUnpackGrad
Expectation: the result match to numpy
"""
a = torch.randn(8, 7, 4, 2)
a_lu, pivots = a.lu()
out = torch.lu_unpack(a_lu, pivots)
a_l_mindspore = Tensor(out[1].numpy(), mstype.uint16)
a_u_mindspore = Tensor(out[2].numpy(), mstype.uint16)
a_lu_mindspore = Tensor(a_lu.numpy(), mstype.uint16)
with pytest.raises(TypeError):
net = NetLuUnpackGrad(l_grad_flag=True, u_grad_flag=True)
net(a_l_mindspore, a_u_mindspore, a_lu_mindspore)

View File

@ -0,0 +1,113 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops.operations.math_ops import LuUnpack
import mindspore.common.dtype as mstype
import torch
import pytest
class LuUnpackNet(nn.Cell):
def __init__(self, unpack_data=True, unpack_pivots=True):
super(LuUnpackNet, self).__init__()
self.lu_unpack = LuUnpack(unpack_data=unpack_data, unpack_pivots=unpack_pivots)
def construct(self, LU_data, LU_pivots):
return self.lu_unpack(LU_data, LU_pivots)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_unpack_float32_int64():
"""
Feature: ALL To ALL
Description: test cases for LuUnpack
Expectation: the result match to torch
"""
my_atol = 1e-5
data = torch.randn(10, 7, 8, 9)
a_lu, pivots = data.lu()
p, a_l, a_u = torch.lu_unpack(a_lu, pivots)
my_a_lu = Tensor(a_lu.numpy(), mstype.float32)
my_pivots = Tensor(pivots.numpy(), mstype.int64)
net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
my_p, my_a_l, my_a_u = net(my_a_lu, my_pivots)
assert np.allclose(p.numpy(), my_p.asnumpy(), atol=my_atol)
assert np.allclose(a_l.numpy(), my_a_l.asnumpy(), atol=my_atol)
assert np.allclose(a_u.numpy(), my_a_u.asnumpy(), atol=my_atol)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_unpack_float64_int32():
"""
Feature: ALL To ALL
Description: test cases for LuUnpack
Expectation: the result match to torch
"""
my_atol = 1e-5
data = torch.randn(7, 8, 9, 10)
a_lu, pivots = data.lu()
p, a_l, a_u = torch.lu_unpack(a_lu, pivots)
my_a_lu = Tensor(a_lu.numpy(), mstype.float64)
my_pivots = Tensor(pivots.numpy(), mstype.int32)
net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
my_p, my_a_l, my_a_u = net(my_a_lu, my_pivots)
assert np.allclose(p.numpy(), my_p.asnumpy(), atol=my_atol)
assert np.allclose(a_l.numpy(), my_a_l.asnumpy(), atol=my_atol)
assert np.allclose(a_u.numpy(), my_a_u.asnumpy(), atol=my_atol)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_unpack_float64_int16():
"""
Feature: ALL To ALL
Description: test cases for LuUnpack
Expectation: the result match to torch
"""
my_atol = 1e-5
data = torch.randn(4, 6, 8, 10)
a_lu, pivots = data.lu()
p, a_l, a_u = torch.lu_unpack(a_lu, pivots)
my_a_lu = Tensor(a_lu.numpy(), mstype.float64)
my_pivots = Tensor(pivots.numpy(), mstype.int16)
net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
my_p, my_a_l, my_a_u = net(my_a_lu, my_pivots)
assert np.allclose(p.numpy(), my_p.asnumpy(), atol=my_atol)
assert np.allclose(a_l.numpy(), my_a_l.asnumpy(), atol=my_atol)
assert np.allclose(a_u.numpy(), my_a_u.asnumpy(), atol=my_atol)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lu_unpack_input_error():
"""
Feature: ALL To ALL
Description: test cases for LuUnpack
Expectation: raise ValueError
"""
my_a_lu = Tensor(np.random.randn(3, 3, 2, 2).astype(np.float32))
my_pivots = Tensor(np.random.randn(3, 3, 5, 3).astype(np.int32))
with pytest.raises(ValueError):
net = LuUnpackNet(unpack_data=True, unpack_pivots=True)
net(my_a_lu, my_pivots)