diff --git a/cmake/package.cmake b/cmake/package.cmake index da75fd07177..8ed2e7d448c 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -212,6 +212,11 @@ if(ENABLE_GPU) DESTINATION ${INSTALL_LIB_DIR} COMPONENT mindspore ) + install( + TARGETS cuda_ops + DESTINATION ${INSTALL_LIB_DIR} + COMPONENT mindspore + ) endif() if(ENABLE_D) diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 6bca1693595..46114fd7c41 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -114,6 +114,8 @@ if(ENABLE_GPU) "plugin/device/gpu/kernel/*.cu" ) + list(FILTER GPU_SRC_LIST EXCLUDE REGEX "plugin/device/gpu/kernel/cuda_impl/cuda_ops/") + list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr) list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/hal/device/blocking_queue.cc" "plugin/device/gpu/hal/device/gpu_buffer_mgr.cc") @@ -145,6 +147,8 @@ if(ENABLE_GPU) cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST}) set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS}) add_compile_definitions(ENABLE_GPU) + + add_subdirectory(plugin/device/gpu/kernel/cuda_impl/cuda_ops) endif() @@ -430,7 +434,7 @@ endif() if(ENABLE_GPU) message("add gpu lib to c_expression") - target_link_libraries(_c_expression PRIVATE gpu_cuda_lib gpu_queue cublas + target_link_libraries(_c_expression PRIVATE gpu_cuda_lib gpu_queue cublas cuda_ops ${CUDA_PATH}/lib64/libcurand.so ${CUDNN_LIBRARY_PATH} ${CUDA_PATH}/lib64/libcudart.so diff --git a/mindspore/ccsrc/cxx_api/CMakeLists.txt b/mindspore/ccsrc/cxx_api/CMakeLists.txt index 4502114e38b..9db16b44d7e 100644 --- a/mindspore/ccsrc/cxx_api/CMakeLists.txt +++ b/mindspore/ccsrc/cxx_api/CMakeLists.txt @@ -140,7 +140,7 @@ if(ENABLE_D) endif() if(ENABLE_GPU) - target_link_libraries(mindspore_shared_lib PRIVATE gpu_cuda_lib gpu_queue cublas + target_link_libraries(mindspore_shared_lib PRIVATE gpu_cuda_lib gpu_queue cublas cuda_ops ${CUDA_PATH}/lib64/libcurand.so ${CUDNN_LIBRARY_PATH} ${CUDA_PATH}/lib64/libcudart.so 
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc b/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc index 1b4f2f5f277..77e6aa1cb12 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc +++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc @@ -26,7 +26,7 @@ #include "kernel/oplib/oplib.h" #include "backend/common/session/anf_runtime_algorithm.h" #include "plugin/device/gpu/kernel/custom/custom_aot_gpu_kernel.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "utils/ms_context.h" #include "utils/ms_utils.h" #include "utils/utils.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc index 98d00fdd667..40d77f8d8b4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc @@ -28,7 +28,7 @@ #include "plugin/device/gpu/hal/device/gpu_buffer_mgr.h" #include "kernel/common_utils.h" #include "plugin/device/gpu/hal/device/gpu_common.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/hal/hardware/optimizer.h" #include "utils/ms_device_shape_transfer.h" #include "utils/context/graph_kernel_flags.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h index 1dc20713dcf..cd09eb7013e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include 
"plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h index 8644187a16e..72cbed431f1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h index 56ab101e245..f6a7b8b3a6e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h index ca70b194841..b687014f980 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include 
"plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h index 73ed5971427..664529e9ae9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h index b79dd918910..c81ac71a433 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h index 5e00ba80955..5a9c18ee4e5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include 
"plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h index 02e33795e73..8b3458dd759 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h index 7ef4a9889a8..a4efb38bae0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h @@ -21,7 +21,7 @@ #include -#include "plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h index f8102103578..52579606904 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h @@ 
-21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h index 7355325d270..52d091066d2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h @@ -23,8 +23,8 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h index 4de4011a5d3..5beb63dcb3b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/gather.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h index 045d0ee49ce..a106a12fb3b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h index 32022f027a0..70ea77f3805 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/gathernd.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh" #include "backend/common/session/anf_runtime_algorithm.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h index 04a786f5459..c4a63212d4f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh" #include "backend/common/session/anf_runtime_algorithm.h" namespace mindspore { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h index d460e34db07..d75de64118e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h @@ -22,9 +22,9 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h index b4ddfcefb48..a8538d0fcda 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h @@ -23,9 +23,9 @@ #include #include #include -#include "utils/complex.h" -#include "plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h index 48f9a2776fe..8527e876bba 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h @@ -25,9 +25,9 @@ #include #include #include -#include "utils/complex.h" -#include "plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "kernel/common_utils.h" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h index 658eb666738..b0bc14f5744 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "kernel/common_utils.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h index 52ba3e02d32..3271c11e2f9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h @@ -22,8 +22,8 @@ #include #include -#include 
"plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h index b1f2b2b4fe0..cf2893f063b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h index fad533cf796..8d3efd720fb 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h @@ -19,7 +19,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h index d945d1f92aa..1e9dc564d18 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/pack.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h index 13da8525029..7acade9fcde 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h @@ -19,7 +19,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/range_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh" namespace mindspore { namespace kernel { constexpr float kStartDefault = 0.; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h index fe3c9617f54..99cdf519e85 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h index 06921785fad..b85f75b2347 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h index bc46f767c85..d073c531019 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h @@ -22,8 +22,8 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h index dbcbaef9101..251e856fc78 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h @@ -22,7 +22,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include 
"plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h index 0acacb8208b..b960bee68fb 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h index c52a96b63ba..7d512f1e826 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h index 9b042838429..79b1a9f7559 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h @@ -19,7 +19,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h index b3da2563db2..12bb75f4245 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/select_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h index 192cf9af970..36b8f57396d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h index e2b3f1ba47d..1291ac7204c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h @@ -23,7 
+23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h index 3b9ea35551a..0cb93e68e98 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h @@ -24,9 +24,9 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h index e37838dfe29..81ffb8ee054 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h index f090ed396b2..7edb4c39205 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h index 3e92763552d..c3016b7ad73 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/split_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h index f73e80dd794..86a44e7c3db 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h @@ -23,7 +23,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/arrays/strided_slice_gpu_common.h" -#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh" namespace mindspore { namespace kernel { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h index 31a1a5b963c..0354b2cac75 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h @@ -23,7 +23,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/arrays/strided_slice_gpu_common.h" -#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h index 4b79e3873b0..1afea2fd7c6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h @@ -25,7 +25,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "kernel/common_utils.h" -#include "plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h index 34923826749..d4a3d268ea5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h index 57f15c53a38..ba7fdb4b693 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h index d16e3c31e1c..60af63b88b0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h @@ -19,7 +19,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h index 9cc4e01841b..6d1a44c9864 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include -#include 
"plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h index afc25e08221..6f110d216e7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h @@ -19,7 +19,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h index b27e0747d20..57eec01d03b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h index d0a1ed551e8..9bef68dfbaf 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h @@ -21,8 +21,8 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include 
"plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h index 7ed3f305b07..9990dc0b04c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h @@ -21,8 +21,8 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh" namespace mindspore { namespace kernel { constexpr size_t kDimSize4 = 4; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h index 1970d2ff4dd..a27a62d0585 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h index 563a27c9f98..e8265dcbe7a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/unpack.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h index 70d03c13279..d94c242202c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h index 6c7ca40378a..915e098f585 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh" namespace mindspore { namespace kernel { diff 
--git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h index c4b04629717..5e8c9e83a52 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh deleted file mode 100644 index 8f76c51e01a..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void ApplyAdagrad(const size_t size, - const bool update_slots, - const S *learning_rate, - const G *gradient, - T *variable, - T *accumulation, - cudaStream_t stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh deleted file mode 100644 index 65a388b1fb2..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void ApplyAdam(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power, const T *learning_rate, - const T *beta1, const T *beta2, const T *epsilon, T *variable, T *m, T *v, cudaStream_t cuda_stream); -template -void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learning_rate, const float *beta1, - const float *beta2, const float *epsilon, const float *decay, T *variable, T *m, T *v, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cuh deleted file mode 100644 index 2addffbf002..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cuh +++ /dev/null @@ -1,24 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ADAM_WEIGHT_DECAY_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ADAM_WEIGHT_DECAY_H_ -template -void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1, const float *one_sub_beta1, - const float *beta2, const float *one_sub_beta2, const float *epsilon, const float *lr, - const float *weight_decay, T *m, T *v, T *param, T *gradient, cudaStream_t stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ADAM_WEIGHT_DECAY_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh deleted file mode 100644 index b8c12da9774..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width, - const uint output_height, const uint output_width, T *input_data, - T *output_data, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh deleted file mode 100644 index caa0418ed38..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width, const uint output_height, - const uint output_width, T *input_data, T *output_data, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh deleted file mode 100644 index 81e10d1d49e..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, cudaStream_t cuda_stream); - -template -void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh deleted file mode 100755 index 5b80eb85b48..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh +++ /dev/null @@ -1,23 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_ -template -void CalArgmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size, S *output, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh deleted file mode 100644 index f60b061b3df..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void BatchNormFold2Forward(const T *x, const T *beta, const T *gamma, const T *batch_std, const T *batch_mean, - const T *running_std, const T *running_mean, const int *global_step, T *y, int freeze_bn, - size_t N, size_t C, size_t H, size_t W, cudaStream_t cuda_stream); -template -void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std, - const T *running_mean, const T *running_std, const T *gamma, T *d_gamma, - T *d_batch_mean, T *d_batch_std, size_t C, cudaStream_t cuda_stream); -template -void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std, - const T *running_mean, const T *running_std, const T *gamma, T *d_gamma, - T *d_batch_mean, T *d_batch_std, size_t C, cudaStream_t cuda_stream); -template -void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *reduce_x, T *tmp2, T *tmp_x, size_t N, - size_t C, size_t H, size_t W, cudaStream_t cuda_stream); - -template -void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_std, T *d_x, size_t N, size_t C, size_t H, - size_t W, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh deleted file mode 100755 index d7ad76c5adc..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORM_FOLD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORM_FOLD_H_ - -template -void CalUpdateRunningStd(int channel_size, double epsilon, T* running_std, cudaStream_t cuda_stream); - -template -void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream); - -template -void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* x, const T* batch_mean, - const T* batch_std, int batch_size, int channel_size, int height, int width, T* dx, - cudaStream_t cuda_stream); -template -void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BATCHNORM_FOLD_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cu deleted file mode 100644 index 4ff5c230f42..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cu +++ /dev/null @@ -1,133 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include "batchtospace_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void BatchToSpace(const size_t size, const T *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - T *output) { - size_t temp_stride = 0; - size_t temp_pos = 0; - size_t idx_on = 0; - size_t idx_oc = 0; - size_t idx_oh = 0; - size_t idx_ow = 0; - size_t idx_in = 0; - size_t input_pos = 0; - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; - pos += blockDim.x * gridDim.x) { - temp_stride = oc * oh * ow; - idx_on = pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= oc; - idx_oc = temp_pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= oh; - idx_oh = temp_pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= ow; - idx_ow = temp_pos / temp_stride; - - idx_in = (((idx_oh + crop_up) % block_num) * block_num + ((idx_ow + crop_lft) % block_num)) * on + idx_on; - input_pos = idx_in * ic; - input_pos = (input_pos + idx_oc) * ih; - input_pos = (input_pos + ((idx_oh + crop_up) - (idx_in / (on * block_num))) / block_num) * iw; - input_pos = (input_pos + ((idx_ow + crop_lft) - ((idx_in / on) % block_num)) / block_num); - output[pos] = input[input_pos]; - } - return; -} - -template -void CalBatchToSpace(const size_t size, const T *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - T *output, cudaStream_t cuda_stream) { - BatchToSpace<<>>( - size, input, 
in, ih, iw, ic, on, oh, ow, oc, crop_up, crop_dn, crop_lft, crop_rht, block_num, output); - return; -} - -template void CalBatchToSpace(const size_t size, const float *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - float *output, cudaStream_t cuda_stream); -template void CalBatchToSpace(const size_t size, const half *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - half *output, cudaStream_t cuda_stream); -template void CalBatchToSpace(const size_t size, const int *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - int *output, cudaStream_t cuda_stream); -template void CalBatchToSpace(const size_t size, const int64_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - int64_t *output, cudaStream_t cuda_stream); -template void CalBatchToSpace(const size_t size, const int16_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - int16_t *output, cudaStream_t cuda_stream); -template void 
CalBatchToSpace(const size_t size, const int8_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - int8_t *output, cudaStream_t cuda_stream); -template void CalBatchToSpace(const size_t size, const uint8_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - uint8_t *output, cudaStream_t cuda_stream); -template void CalBatchToSpace(const size_t size, const uint16_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - uint16_t *output, cudaStream_t cuda_stream); -template void CalBatchToSpace(const size_t size, const uint32_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - uint32_t *output, cudaStream_t cuda_stream); -template void CalBatchToSpace(const size_t size, const uint64_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - uint64_t *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh deleted file mode 100644 index cbf6a3976a6..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_ -template -void CalBatchToSpace(const size_t size, const T *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t crop_up, const size_t crop_dn, - const size_t crop_lft, const size_t crop_rht, const size_t block_num, - T *output, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh deleted file mode 100644 index 7654f111033..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_ - -#define MAX_LOGITS_DIMENSION 8 -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void CalBCEWithLogitsLoss(const size_t input_size, const T *predict, const T *target, const size_t *input_shape, - const size_t shape_size, const T *weight, const size_t *weight_shape, - const bool weight_need_broadcast, const T *pos_weight, const size_t *pos_weight_shape, - const bool pos_weight_need_broadcast, T *shape_broadcasted, T *output, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh deleted file mode 100644 index 40dd5099640..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, - const T* dy, T* db, cudaStream_t cuda_stream); -template -void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width, - const T* dy, T* db, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh deleted file mode 100644 index ed459c57f15..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bboxes, const float &m1, const float &m2, - const float &m3, const float &m4, const float &s1, const float &s2, const float &s3, - const float &s4, const int &max_height, const int &max_width, const float &ratio_clip, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh deleted file mode 100644 index f3345090b95..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas, const float &m1, - const float &m2, const float &m3, const float &m4, const float &s1, const float &s2, - const float &s3, const float &s4, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh deleted file mode 100644 index 7fbc486ace7..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -enum BroadcastGradOpType { - BROADCAST_GRAD_TYPE_MAXIMUM = 0, - BROADCAST_GRAD_TYPE_MINIMUM = 1, - BROADCAST_GRAD_TYPE_INVALID = 0xffffffff, -}; - -template -void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, - const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, - const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const T *x1, const T *x2, - const T *dy, T *dx1, T *dx2, cudaStream_t stream); - -template -void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, - const T *x1, const T *x2, const T *dy, T *dx1, T *dx2, cudaStream_t stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh deleted file mode 100644 index 6d17c2a5c84..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_ - -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "utils/complex.h" - -const float kFloatEplison = 1e-37; - -enum BroadcastOpType { - BROADCAST_TYPE_GREATER = 0, - BROADCAST_TYPE_LESS = 1, - BROADCAST_TYPE_MAXIMUM = 2, - BROADCAST_TYPE_MINIMUM = 3, - BROADCAST_TYPE_POWER = 4, - BROADCAST_TYPE_REALDIV = 5, - BROADCAST_TYPE_MUL = 6, - BROADCAST_TYPE_SUB = 7, - BROADCAST_TYPE_ADD = 8, - BROADCAST_TYPE_FLOORDIV = 9, - BROADCAST_TYPE_ABSGRAD = 10, - BROADCAST_TYPE_DIV = 11, - BROADCAST_TYPE_DIVNONAN = 12, - BROADCAST_TYPE_EQUAL = 13, - BROADCAST_TYPE_SQUARED_DIFFERENCE = 14, - BROADCAST_TYPE_MOD = 15, - BROADCAST_TYPE_FLOORMOD = 16, - BROADCAST_TYPE_ATAN2 = 17, - BROADCAST_TYPE_GREATER_EQUAL = 18, - BROADCAST_TYPE_LESS_EQUAL = 19, - BROADCAST_TYPE_NOT_EQUAL = 20, - BROADCAST_TYPE_LOGICAL_AND = 21, - BROADCAST_TYPE_LOGICAL_OR = 22, - BROADCAST_TYPE_TRUNCATEDIV = 23, - BROADCAST_TYPE_TRUNCATEMOD = 24, - BROADCAST_TYPE_COMPLEX = 25, - BROADCAST_TYPE_INVALID = 0xffffffff, -}; - -template -void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream); - -template -void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream); - -template -void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const T1 *x0, const T2 *x1, - Complex *y, cudaStream_t stream); - -template -void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, - cudaStream_t stream); - -template -void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const T *x0, const T *x1, T *y, - cudaStream_t stream); - -template -void BroadcastComplexArith(const std::vector &x0_dims, 
const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const T1 *x0, const T2 *x1, - Complex *y, cudaStream_t stream); -template -void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const T *x0, const T *x1, - Complex *y, cudaStream_t stream); - -template -void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0, - const size_t &o1, const size_t &o2, const size_t &o3, const T *input_addr, T *output_addr, - cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cu deleted file mode 100644 index 63e48c0fc08..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cu +++ /dev/null @@ -1,318 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -// Generic cast -template -__device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) { - *output_addr = static_cast((*input_addr)); -} - -// half --> integer -__device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) { - *output_addr = __half2ull_rz((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) { - *output_addr = __half2ll_rz((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) { - *output_addr = __half2uint_rz((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) { - *output_addr = __half2int_rz((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const half *input_addr, uint16_t *output_addr) { - *output_addr = __half2ushort_rz((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const half *input_addr, int16_t *output_addr) { - *output_addr = __half2short_rz((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const half *input_addr, uint8_t *output_addr) { - *output_addr = static_cast(__half2ushort_rz((*input_addr))); -} - -__device__ __forceinline__ void CastBase(const half *input_addr, int8_t *output_addr) { - *output_addr = static_cast(__half2short_rz((*input_addr))); -} - -// integer --> half -__device__ __forceinline__ void CastBase(const uint64_t *input_addr, half *output_addr) { - *output_addr = __ull2half_rn((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const int64_t *input_addr, half *output_addr) { - *output_addr = __ll2half_rn((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const uint32_t *input_addr, half *output_addr) { - *output_addr = __uint2half_rn((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const 
int32_t *input_addr, half *output_addr) { - *output_addr = __int2half_rn((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const uint16_t *input_addr, half *output_addr) { - *output_addr = __ushort2half_rn((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const int16_t *input_addr, half *output_addr) { - *output_addr = __short2half_rn((*input_addr)); -} - -__device__ __forceinline__ void CastBase(const uint8_t *input_addr, half *output_addr) { - *output_addr = __ushort2half_rn(static_cast(*input_addr)); -} - -__device__ __forceinline__ void CastBase(const int8_t *input_addr, half *output_addr) { - *output_addr = __short2half_rn(static_cast(*input_addr)); -} - -// Cast -template -__global__ void CastKernel(const int input_size, const S *input_addr, T *output_addr) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < input_size; pos += blockDim.x * gridDim.x) { - CastBase(input_addr + pos, output_addr + pos); - } -} - -template -void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream) { - CastKernel<<>>(input_size, input_addr, output_addr); -} - -template void Cast(const int input_size, const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, uint64_t 
*output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int8_t *input_addr, Complex *output_addr, cudaStream_t stream); - -template void Cast(const int input_size, const int16_t *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t 
*input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int16_t *input_addr, Complex *output_addr, cudaStream_t stream); - - -template void Cast(const int input_size, const int32_t *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int32_t *input_addr, Complex *output_addr, cudaStream_t stream); - - -template void Cast(const int input_size, const int64_t *input_addr, int8_t *output_addr, cudaStream_t stream); -template void 
Cast(const int input_size, const int64_t *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const int64_t *input_addr, Complex *output_addr, cudaStream_t stream); - -template void Cast(const int input_size, const uint8_t *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, uint8_t *output_addr, 
cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint8_t *input_addr, Complex *output_addr, cudaStream_t stream); - -template void Cast(const int input_size, const uint16_t *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const 
uint16_t *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint16_t *input_addr, Complex *output_addr, cudaStream_t stream); - -template void Cast(const int input_size, const uint32_t *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, bool *output_addr, cudaStream_t stream); -template 
void Cast(const int input_size, const uint32_t *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint32_t *input_addr, Complex *output_addr, cudaStream_t stream); - -template void Cast(const int input_size, const uint64_t *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const uint64_t *input_addr, Complex *output_addr, cudaStream_t stream); - -template void Cast(const int input_size, const half *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, int16_t 
*output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const half *input_addr, Complex *output_addr, cudaStream_t stream); - -template void Cast(const int input_size, const float *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, uint16_t *output_addr, 
cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const float *input_addr, Complex *output_addr, cudaStream_t stream); - -template void Cast(const int input_size, const double *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, double 
*output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const double *input_addr, Complex *output_addr, cudaStream_t stream); - -template void Cast(const int input_size, const bool *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, Complex *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const bool *input_addr, Complex *output_addr, 
cudaStream_t stream); - -template void Cast(const int input_size, const Complex *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, Complex *output_addr, - cudaStream_t stream); - -template void Cast(const int input_size, const Complex *input_addr, int8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, int16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, int32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, int64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex 
*input_addr, uint8_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, uint16_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, uint32_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, uint64_t *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, float *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, double *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, half *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, bool *output_addr, cudaStream_t stream); -template void Cast(const int input_size, const Complex *input_addr, Complex *output_addr, - cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh deleted file mode 100644 index c7eab4b0a81..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CLIP_GRAD_NORM_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CLIP_GRAD_NORM_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void ScalingGradOp(const size_t size, const T *x, const float *scaling_factor, float *scaling_out_addr, - cudaStream_t cuda_stream); - -template -void ClipGradNormOp(const size_t size, const float *x, const T *clip_norm, const float *reduce_sum_value, - float *output_addr, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CLIP_GRAD_NORM_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cu deleted file mode 100755 index ca409ec126d..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cu +++ /dev/null @@ -1,92 +0,0 @@ -/** - * Copyright 2019-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include "plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh" -template -__global__ void Concat(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis, - int *len_axis, T **inputs, T *output) { - for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { - int num = pos % all_size_before_axis / all_size_axis; - int block = -1; - int axis_inc = 0; - int block_len = 0; - for (int i = 0; i < input_num; i++) { - if (axis_inc <= num) { - block++; - axis_inc += len_axis[i]; - } else { - break; - } - } - block_len = len_axis[block]; - axis_inc -= len_axis[block]; - int block_pos = - pos / all_size_before_axis * block_len * all_size_axis + (num - axis_inc) * all_size_axis + pos % all_size_axis; - output[pos] = inputs[block][block_pos]; - } - return; -} - -template -void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis, - int *len_axis, T **inputs, T *output, cudaStream_t cuda_stream) { - Concat<<>>(size, input_num, all_size_before_axis, all_size_axis, - len_axis, inputs, output); - return; -} - -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, double **inputs, double *output, - cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, float **inputs, float *output, - cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, half **inputs, half *output, - cudaStream_t cuda_stream); - -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, int64_t **inputs, int64_t *output, - cudaStream_t cuda_stream); 
-template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, int **inputs, int *output, - cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, short **inputs, short *output, // NOLINT - cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, char **inputs, char *output, - cudaStream_t cuda_stream); - -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, uint64_t **inputs, uint64_t *output, - cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, uint32_t **inputs, uint32_t *output, - cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, uint16_t **inputs, uint16_t *output, - cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, unsigned char **inputs, unsigned char *output, - cudaStream_t cuda_stream); - -template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, - const int all_size_axis, int *len_axis, bool **inputs, bool *output, - cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh deleted file mode 100644 index 17d8ba82723..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh +++ /dev/null @@ -1,34 +0,0 @@ -/** 
- * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth, - const size_t width, T *input_addr, T *outt_addr, cudaStream_t cuda_stream); - -template -void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth, - const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream); - -template -void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t ori_h, - const size_t ori_w, const size_t batchwidth, const size_t width, T *input_addr, T *output_addr, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh deleted file mode 100644 index 176c063dc8e..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you 
may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CORRECTIONMUL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CORRECTIONMUL_H_ - -template -void CalCorrectionMul(const T* weight, const T* gamma, const T* running_std, int batch_size, int channel_size, - int height, int width, T* output, cudaStream_t cuda_stream); - -template -void CalCorrectionMulGrad(const T* d_out, const T* weight, const T* running_std, int batch_size, int channel_size, - int height, int width, T* d_gamma, T* tmp, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CORRECTIONMUL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh deleted file mode 100644 index c3f31f29940..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CROP_AND_RESIZE_IMPL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CROP_AND_RESIZE_IMPL_H_ -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void CalCropAndResize(const size_t size, const T *input_image, float *input_boxes, int *input_box_index, int batch, - int input_height, int input_width, int final_height, int final_width, int channel, - int method, float extrapol_val, float *output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CROP_AND_RESIZE_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh deleted file mode 100644 index 2440135fe42..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CROSSENTROPY_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CROSSENTROPY_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -// The batch size limit to judge whether to use multiple threads. 
-constexpr int kLargeBatchLowLimit = 32768; - -template -void CrossEntropyWithSparse(const T *logits, const S *labels, const size_t batch_size, const size_t class_num, T *loss, - cudaStream_t cuda_stream); - -template -void CrossEntropyGradWithSparse(const T *logits, const S *labels, const size_t batch_size, const size_t class_num, - T *grad, cudaStream_t cuda_stream); - -template -void CrossEntropy(const T *logits, const S *labels, const size_t batch_size, const size_t class_num, T *losses, - T *dlogits, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CROSSENTROPY_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh deleted file mode 100644 index 7e155ced56e..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH - -template -void CalculateFwdVar(T *log_alpha_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length, - bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank, int *label_squence_length, - int *cum_labels_length, bool ignore_longer_outputs_than_inputs, cudaStream_t stream); - -template -void CalculateBwdVar(T *log_beta_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length, - bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank, int *label_squence_length, - int *cum_labels_length, bool ignore_longer_outputs_than_inputs, cudaStream_t stream); - -template -void InnerSoftMax(const T *probs, T *softmax_cost, const int *sequence_length, int max_time, int batch, int numclass, - cudaStream_t stream); - -void GenLabelValuePCR(int *label_value_sp, int *label_value_pcr, int *label_squence_length, int *cum_labels_length, - int *max_labels_length, int batch, cudaStream_t stream); - -void GenLabelWithBlank(int *label_value, int *label_value_with_blank, int *label_squence_length, - int *precum_labels_length, int *cum_labels_length, int batch, int blank, cudaStream_t stream); - -void GenLabelValue(int *label_value_sp, const int64_t *label_indices, const int *label_values, - int *label_squence_length, int *cum_labels_length, int *max_labels_length, int size, int blank, - int batch, cudaStream_t stream); - -void CalculatePreLength(int *label_squence_length, int *precum_labels_length, int *cum_labels_length, - int *max_labels_length, const int64_t *label_indices, int batch, int size, cudaStream_t stream); -void CalculateMaxSequence(const int *sequence_length, int *max_labels_length, int batch, cudaStream_t stream); -template -void CTCLoss(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_with_blank, int batch, int SOffSet, - int maxtime, int numclass, const 
int *sequence_length, int *label_squence_length, int *cum_labels_length, - T *cost, T *grads, T *prob_num, bool ignore_longer_outputs_than_inputs, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt new file mode 100644 index 00000000000..d81c38e431b --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt @@ -0,0 +1,27 @@ +file(GLOB_RECURSE CUDA_OPS_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cu") + +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-delete-non-abstract-non-virtual-dtor -Wno-overloaded-virtual") +endif() + +if(${CUDA_VERSION} VERSION_LESS 11.0) + string(REPLACE "-std=c++17" "-std=c++11" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +else() + string(REPLACE "-std=c++17" "-std=c++14" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif() + +set_property(SOURCE ${CUDA_OPS_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL) +if(ENABLE_GPU) + add_library(cuda_common_obj OBJECT cuda_common.cc) + target_compile_options(cuda_common_obj PRIVATE "-std=c++17") + cuda_add_library(cuda_ops SHARED ${CUDA_OPS_SRC_LIST} $) + message("add gpu lib to cuda_ops") + target_link_libraries(cuda_ops mindspore_core + ${CUDA_PATH}/lib64/libcurand.so + ${CUDNN_LIBRARY_PATH} + ${CUDA_PATH}/lib64/libcudart.so + ${CUDA_PATH}/lib64/stubs/libcuda.so + ${CUDA_PATH}/lib64/libcusolver.so + ${CUDA_PATH}/lib64/libcufft.so + ${CUDA_PATH}/lib64/libcublas.so) +endif() \ No newline at end of file diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cu similarity index 55% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cu rename to 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cu index 0680867b722..765b73ac229 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh" +#include "include/cuda_fp16.h" template __device__ __forceinline__ T SqrtFunc(T input) { @@ -113,50 +114,50 @@ void ApplyAdagrad(const size_t size, size, update_slots, learning_rate, gradient, variable, accumulation); } -template void ApplyAdagrad(const size_t size, - const bool update_slots, - const float *learning_rate, - const float *gradient, - float *variable, - float *accumulation, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdagrad(const size_t size, + const bool update_slots, + const float *learning_rate, + const float *gradient, + float *variable, + float *accumulation, + cudaStream_t cuda_stream); -template void ApplyAdagrad(const size_t size, - const bool update_slots, - const half *learning_rate, - const half *gradient, - half *variable, - half *accumulation, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdagrad(const size_t size, + const bool update_slots, + const half *learning_rate, + const half *gradient, + half *variable, + half *accumulation, + cudaStream_t cuda_stream); -template void ApplyAdagrad(const size_t size, - const bool update_slots, - const float *learning_rate, - const half *gradient, - half *variable, - half *accumulation, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdagrad(const size_t size, + const bool update_slots, + const float *learning_rate, + const half *gradient, + half *variable, + half *accumulation, + cudaStream_t cuda_stream); -template void ApplyAdagrad(const size_t size, - const bool update_slots, - const 
float *learning_rate, - const half *gradient, - float *variable, - float *accumulation, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdagrad(const size_t size, + const bool update_slots, + const float *learning_rate, + const half *gradient, + float *variable, + float *accumulation, + cudaStream_t cuda_stream); -template void ApplyAdagrad(const size_t size, - const bool update_slots, - const half *learning_rate, - const float *gradient, - float *variable, - float *accumulation, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdagrad(const size_t size, + const bool update_slots, + const half *learning_rate, + const float *gradient, + float *variable, + float *accumulation, + cudaStream_t cuda_stream); -template void ApplyAdagrad(const size_t size, - const bool update_slots, - const float *learning_rate, - const float *gradient, - half *variable, - half *accumulation, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdagrad(const size_t size, + const bool update_slots, + const float *learning_rate, + const float *gradient, + half *variable, + half *accumulation, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh new file mode 100644 index 00000000000..7f73af3ef57 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAGRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAGRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void ApplyAdagrad(const size_t size, + const bool update_slots, + const S *learning_rate, + const G *gradient, + T *variable, + T *accumulation, + cudaStream_t stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAGRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cu similarity index 69% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cu index 8c3859dbefd..e2bd92c3264 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh" +#include "include/cuda_fp16.h" template __device__ __forceinline__ T SqrtFunc(T input) { @@ -82,16 +83,19 @@ void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learni epsilon, decay, variable, m, v); } -template void ApplyAdam(const size_t size, const float *gradient, const float *beta1_power, - const float *beta2_power, const float *learning_rate, const float *beta1, - const float *beta2, const float *epsilon, float *variable, float *m, float *v, - cudaStream_t cuda_stream); -template void ApplyAdam(const size_t size, const half *gradient, const half *beta1_power, const half *beta2_power, - const half *learning_rate, const half *beta1, const half *beta2, const half *epsilon, - half *variable, half *m, half *v, cudaStream_t cuda_stream); -template void AdamWeightDecayOp(const size_t size, const float *gradient, const float *learning_rate, - const float *beta1, const float *beta2, const float *epsilon, const float *decay, - float *variable, float *m, float *v, cudaStream_t cuda_stream); -template void AdamWeightDecayOp(const size_t size, const half *gradient, const float *learning_rate, - const float *beta1, const float *beta2, const float *epsilon, const float *decay, - half *variable, half *m, half *v, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdam(const size_t size, const float *gradient, const float *beta1_power, + const float *beta2_power, const float *learning_rate, const float *beta1, + const float *beta2, const float *epsilon, float *variable, float *m, + float *v, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdam(const size_t size, const half *gradient, const half *beta1_power, + const half *beta2_power, const half *learning_rate, const half *beta1, + const half *beta2, const half *epsilon, half *variable, half *m, half *v, + cudaStream_t cuda_stream); +template 
CUDA_LIB_EXPORT void AdamWeightDecayOp(const size_t size, const float *gradient, + const float *learning_rate, const float *beta1, + const float *beta2, const float *epsilon, const float *decay, + float *variable, float *m, float *v, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AdamWeightDecayOp(const size_t size, const half *gradient, + const float *learning_rate, const float *beta1, + const float *beta2, const float *epsilon, const float *decay, + half *variable, half *m, half *v, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh new file mode 100644 index 00000000000..ab42e3f250d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void ApplyAdam(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power, + const T *learning_rate, const T *beta1, const T *beta2, const T *epsilon, T *variable, + T *m, T *v, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learning_rate, + const float *beta1, const float *beta2, const float *epsilon, const float *decay, + T *variable, T *m, T *v, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cu similarity index 80% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cu index 7eae29a155b..21425267fba 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cu @@ -15,7 +15,6 @@ */ #include "adam_weight_decay_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" template __global__ void AdamWeightDecayKernel(const int element_num_, const bool need_decay, const float *beta1, @@ -44,7 +43,8 @@ void AdamWeightDecay(const int &element_num_, const bool &need_decay, const floa gradient); } -template void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1, - const float *one_sub_beta1, const float *beta2, const float *one_sub_beta2, - const float *epsilon, const float *lr, 
const float *weight_decay, float *m, float *v, - float *param, float *gradient, cudaStream_t stream); +template CUDA_LIB_EXPORT void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1, + const float *one_sub_beta1, const float *beta2, + const float *one_sub_beta2, const float *epsilon, const float *lr, + const float *weight_decay, float *m, float *v, float *param, + float *gradient, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh new file mode 100644 index 00000000000..7e52876a3e5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_WEIGHT_DECAY_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_WEIGHT_DECAY_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1, + const float *one_sub_beta1, const float *beta2, const float *one_sub_beta2, + const float *epsilon, const float *lr, const float *weight_decay, T *m, T *v, + T *param, T *gradient, cudaStream_t stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_WEIGHT_DECAY_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cu similarity index 82% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cu index 0b3c3f92646..275cda84ecf 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh" +#include "include/cuda_fp16.h" __device__ inline uint start_index(uint a, uint b, uint c) { return floorf(__uint2float_rn(a * c) / __uint2float_rn(b)); @@ -168,14 +169,17 @@ void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const size, input_height, input_width, output_height, output_width, input_data, output_data); } -template void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width, - const uint output_height, const uint output_width, float *input_data, - float *output_data, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, + const uint input_width, const uint output_height, + const uint output_width, float *input_data, + float *output_data, cudaStream_t cuda_stream); -template void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width, - const uint output_height, const uint output_width, half *input_data, - half *output_data, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, + const uint input_width, const uint output_height, + const uint output_width, half *input_data, + half *output_data, cudaStream_t cuda_stream); -template void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width, - const uint output_height, const uint output_width, double *input_data, - double *output_data, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, + const uint input_width, const uint output_height, + const uint output_width, double *input_data, + double *output_data, cudaStream_t cuda_stream); diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh new file mode 100644 index 00000000000..d9bb7c22aea --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh @@ -0,0 +1,25 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width, + const uint output_height, const uint output_width, T *input_data, + T *output_data, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cu similarity index 81% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cu rename to 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cu index e93af42a28f..11cb4b1cfa4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh" +#include "include/cuda_fp16.h" __device__ inline uint start_index(uint a, uint b, uint c) { return floorf(__uint2float_rn(a * c) / __uint2float_rn(b)); @@ -155,14 +156,17 @@ void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint size, input_height, input_width, output_height, output_width, input_data, output_data); } -template void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width, - const uint output_height, const uint output_width, float *input_data, - float *output_data, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, + const uint input_width, const uint output_height, + const uint output_width, float *input_data, + float *output_data, cudaStream_t cuda_stream); -template void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width, - const uint output_height, const uint output_width, half *input_data, - half *output_data, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, + const uint input_width, const uint output_height, + const uint output_width, half *input_data, + half *output_data, cudaStream_t cuda_stream); -template void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width, - const uint output_height, const uint output_width, double *input_data, - double *output_data, 
cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, + const uint input_width, const uint output_height, + const uint output_width, double *input_data, + double *output_data, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh new file mode 100644 index 00000000000..b6ea4787f93 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh @@ -0,0 +1,25 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width, + const uint output_height, const uint output_width, T *input_data, + T *output_data, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_impl.cuh similarity index 51% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_impl.cuh index a01a40443be..2eb9ac5ede7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_impl.cuh @@ -14,12 +14,15 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, + cudaStream_t cuda_stream); -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void SigmoidCrossEntropyWithLogits(const size_t size, const T *logits, const S *labels, T *outputs, +template +CUDA_LIB_EXPORT void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cu similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cu index e55e15ea6c3..d063133bcd7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void AddReluV2Kernel(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask) { @@ -49,20 +49,20 @@ void AddReluGradV2(const size_t num, const T *x1, const T *x2, const uint32_t *m AddReluGradV2Kernel<<>>(num, x1, x2, mask, dx); } -template void AddReluV2(const size_t num, const float *x1, const float *x2, float *y, uint32_t *mask, - cudaStream_t cuda_stream); -template void AddReluV2(const size_t num, const half *x1, const half *x2, half *y, uint32_t *mask, - cudaStream_t cuda_stream); -template void AddReluV2(const size_t num, const int32_t *x1, const int32_t *x2, int32_t *y, uint32_t *mask, - cudaStream_t cuda_stream); -template void AddReluV2(const size_t num, const int64_t *x1, const int64_t *x2, int64_t *y, uint32_t *mask, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const float *x1, const float *x2, float *y, uint32_t *mask, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const half *x1, const half *x2, half *y, uint32_t *mask, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const int32_t *x1, const int32_t *x2, int32_t *y, + uint32_t *mask, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const int64_t *x1, const int64_t *x2, int64_t *y, + uint32_t *mask, cudaStream_t cuda_stream); -template void AddReluGradV2(const size_t num, const float *x1, const float *x2, const uint32_t *mask, float *dx, - cudaStream_t cuda_stream); -template void AddReluGradV2(const size_t num, const half *x1, const half *x2, const uint32_t *mask, half *dx, - cudaStream_t cuda_stream); -template void AddReluGradV2(const size_t num, 
const int32_t *x1, const int32_t *x2, const uint32_t *mask, int32_t *dx, - cudaStream_t cuda_stream); -template void AddReluGradV2(const size_t num, const int64_t *x1, const int64_t *x2, const uint32_t *mask, int64_t *dx, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const float *x1, const float *x2, const uint32_t *mask, + float *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const half *x1, const half *x2, const uint32_t *mask, + half *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const int32_t *x1, const int32_t *x2, + const uint32_t *mask, int32_t *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const int64_t *x1, const int64_t *x2, + const uint32_t *mask, int64_t *dx, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh new file mode 100644 index 00000000000..19af5ceb0bf --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh @@ -0,0 +1,28 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_V2_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_V2_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, + cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_V2_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cu similarity index 66% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cu index 4f3489bf2ed..37abea756be 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void ApplyGradientDescent(const size_t size, T *var, const T *alpha, const T *delta, T *output) { @@ -31,7 +32,8 @@ void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T ApplyGradientDescent<<>>(size, var, alpha, delta, output); } -template void CalApplyGradientDescent(const size_t &size, float *var, const float *alpha, const float *delta, - float *output, cudaStream_t cuda_stream); -template void CalApplyGradientDescent(const size_t &size, half *var, const half *alpha, const half *delta, - half *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalApplyGradientDescent(const size_t &size, float *var, const float *alpha, + const float *delta, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalApplyGradientDescent(const size_t &size, half *var, const half *alpha, + const half *delta, half *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh similarity index 53% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh index 48d61d63b81..19bb5afcff8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh @@ -14,16 +14,13 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSIGMOID_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSIGMOID_IMPL_CUH_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_APPLY_GRADIENT_DESCENT_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_APPLY_GRADIENT_DESCENT_IMPL_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalHSigmoid(const size_t &size, const T *input, T *output, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T *delta, T *output, + cudaStream_t cuda_stream); -template -void CalHSigmoidGrad(const size_t &size, const T *dout, const T *x, T *output, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSIGMOID_IMPL_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_APPLY_GRADIENT_DESCENT_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cu similarity index 85% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cu index 7f7eb415bd8..9fceaa6e3f7 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cu @@ -15,8 +15,6 @@ */ #include "argmax_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "include/cuda_fp16.h" template __global__ void Argmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size, S *output) { @@ -46,7 +44,9 @@ void CalArgmax(const T *input, const S bound, const size_t outer_size, const siz return; } -template void CalArgmax(const float 
*input, const int bound, const size_t outer_size, +template +CUDA_LIB_EXPORT void CalArgmax(const float *input, const int bound, const size_t outer_size, const size_t inner_size, int *output, cudaStream_t cuda_stream); -template void CalArgmax(const half *input, const int bound, const size_t outer_size, +template +CUDA_LIB_EXPORT void CalArgmax(const half *input, const int bound, const size_t outer_size, const size_t inner_size, int *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh new file mode 100755 index 00000000000..8cee0638634 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh @@ -0,0 +1,38 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ARGMAX_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ARGMAX_IMPL_CUH_ +#include "include/cuda_fp16.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +#ifdef __cplusplus +extern "C" { +#endif +CUDA_LIB_EXPORT void CalArgmaxFp32(const float *input, const int bound, const size_t outer_size, + const size_t inner_size, int *output, cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalArgmaxFp16(const half *input, const int bound, const size_t outer_size, + const size_t inner_size, int *output, cudaStream_t cuda_stream); +#ifdef __cplusplus +} +#endif + +template +CUDA_LIB_EXPORT void CalArgmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size, + S *output, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ARGMAX_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cu similarity index 60% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cu index 0be7e20137c..5953c5248ae 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cu @@ -15,8 +15,6 @@ */ #include "assign_add_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "include/cuda_fp16.h" template __global__ void AssignAdd(const size_t size, T* ref, const T* value, T* output) { for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { @@ -33,10 +31,11 @@ void CalAssignAdd(const size_t size, T* ref, const T* value, T* output, cudaStre return; } -template void CalAssignAdd(const size_t size, float* 
ref, const float* value, float* output, - cudaStream_t cuda_stream); -template void CalAssignAdd(const size_t size, half* ref, const half* value, half* output, - cudaStream_t cuda_stream); -template void CalAssignAdd(const size_t size, int* ref, const int* value, int* output, cudaStream_t cuda_stream); -template void CalAssignAdd(const size_t size, int64_t* ref, const int64_t* value, int64_t* output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalAssignAdd(const size_t size, float* ref, const float* value, float* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalAssignAdd(const size_t size, half* ref, const half* value, half* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalAssignAdd(const size_t size, int* ref, const int* value, int* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalAssignAdd(const size_t size, int64_t* ref, const int64_t* value, + int64_t* output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cuh similarity index 56% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cuh index b095384aced..d4dd0e64484 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cuh @@ -14,9 +14,11 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ASSIGNADD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ASSIGNADD_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ASSIGN_ADD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ASSIGN_ADD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "include/cuda_fp16.h" template -void CalAssignAdd(const size_t size, T* ref, const T* value, T* output, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalAssignAdd(const size_t size, T* ref, const T* value, T* output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ASSIGNADD_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ASSIGN_ADD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cu similarity index 72% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cu index 3ef856e00af..39ebbc88411 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cu @@ -109,10 +109,11 @@ void BatchNormFold2Forward(const T *x, const T *beta, const T *gamma, const T *b x, beta, gamma, batch_std, batch_mean, running_std, running_mean, global_step, y, freeze_bn, N, C, H, W); } -template void BatchNormFold2Forward(const float *x, const float *beta, const float *gamma, - const float *batch_std, const float *batch_mean, const float *running_std, - const float *running_mean, const int *global_step, float *y, int freeze_bn, - size_t N, size_t C, size_t H, size_t W, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void BatchNormFold2Forward(const float *x, const float *beta, const float *gamma, + 
const float *batch_std, const float *batch_mean, + const float *running_std, const float *running_mean, + const int *global_step, float *y, int freeze_bn, size_t N, + size_t C, size_t H, size_t W, cudaStream_t cuda_stream); template void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *reduce_x, T *tmp2, T *tmp_x, size_t N, @@ -124,9 +125,10 @@ void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *r BatchNormFold2GradReduce2<<>>(tmp, d_beta, tmp2, reduce_x, N, C); } -template void BatchNormFold2GradReduce(const float *dout, const float *x, float *d_beta, float *tmp, - float *reduce_x, float *tmp2, float *tmp_x, size_t N, size_t C, size_t H, - size_t W, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void BatchNormFold2GradReduce(const float *dout, const float *x, float *d_beta, + float *tmp, float *reduce_x, float *tmp2, float *tmp_x, + size_t N, size_t C, size_t H, size_t W, + cudaStream_t cuda_stream); template void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std, @@ -136,11 +138,12 @@ void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T d_beta, reduce_x, batch_mean, batch_std, running_mean, running_std, gamma, d_gamma, d_batch_mean, d_batch_std, C); } -template void CalBatchNormFold2GradNotFreeze(const float *d_beta, const float *reduce_x, const float *batch_mean, - const float *batch_std, const float *running_mean, - const float *running_std, const float *gamma, float *d_gamma, - float *d_batch_mean, float *d_batch_std, size_t C, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreeze(const float *d_beta, const float *reduce_x, + const float *batch_mean, const float *batch_std, + const float *running_mean, const float *running_std, + const float *gamma, float *d_gamma, + float *d_batch_mean, float *d_batch_std, size_t C, + cudaStream_t cuda_stream); template void 
CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std, @@ -152,11 +155,12 @@ void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *ba ThrustFillWith(d_batch_std, C, (T)0.f, cuda_stream); } -template void CalBatchNormFold2GradFreeze(const float *d_beta, const float *reduce_x, const float *batch_mean, - const float *batch_std, const float *running_mean, - const float *running_std, const float *gamma, float *d_gamma, - float *d_batch_mean, float *d_batch_std, size_t C, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchNormFold2GradFreeze(const float *d_beta, const float *reduce_x, + const float *batch_mean, const float *batch_std, + const float *running_mean, const float *running_std, + const float *gamma, float *d_gamma, + float *d_batch_mean, float *d_batch_std, size_t C, + cudaStream_t cuda_stream); template void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_std, T *d_x, size_t N, size_t C, size_t H, @@ -164,6 +168,7 @@ void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_st DxMul<<>>(N, C, H * W, batch_std, running_std, d_x); } -template void CalBatchNormFold2GradNotFreezeDxMul(const float *batch_std, const float *running_std, float *d_x, - size_t N, size_t C, size_t H, size_t W, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreezeDxMul(const float *batch_std, + const float *running_std, float *d_x, + size_t N, size_t C, size_t H, size_t W, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh new file mode 100644 index 00000000000..955d18b7dd5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh @@ -0,0 +1,43 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD2_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD2_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void BatchNormFold2Forward(const T *x, const T *beta, const T *gamma, const T *batch_std, + const T *batch_mean, const T *running_std, const T *running_mean, + const int *global_step, T *y, int freeze_bn, size_t N, size_t C, size_t H, + size_t W, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, + const T *batch_std, const T *running_mean, const T *running_std, + const T *gamma, T *d_gamma, T *d_batch_mean, T *d_batch_std, + size_t C, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, + const T *batch_std, const T *running_mean, const T *running_std, + const T *gamma, T *d_gamma, T *d_batch_mean, T *d_batch_std, size_t C, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *reduce_x, T *tmp2, + T *tmp_x, size_t N, size_t C, size_t H, size_t W, + cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, 
const T *running_std, T *d_x, size_t N, + size_t C, size_t H, size_t W, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD2_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cu similarity index 78% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cu index 54f1c11ab9f..1dafd8b87c6 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cu @@ -18,7 +18,6 @@ #include #include #include "batchnorm_fold_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" template __global__ void UpdateRunningStd(int channel_size, const double epsilon, T* running_std) { @@ -55,8 +54,8 @@ void CalUpdateRunningStd(int channel_size, double epsilon, T* running_std, cudaS return; } -template void CalUpdateRunningStd(int channel_size, double epsilon, float* running_std, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalUpdateRunningStd(int channel_size, double epsilon, float* running_std, + cudaStream_t cuda_stream); template void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream) { @@ -64,7 +63,7 @@ void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream) return; } -template void CalUpdateBatchStd(int channel_size, float* batch_std, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalUpdateBatchStd(int channel_size, float* batch_std, cudaStream_t cuda_stream); template void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* x, const T* batch_mean, @@ -74,9 +73,10 @@ void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* d_batch_mean, 
d_batch_std, x, batch_mean, batch_std, batch_size, channel_size, height, width, dx); } -template void CalBatchNormFoldGrad(const float* d_batch_mean, const float* d_batch_std, const float* x, - const float* batch_mean, const float* batch_std, int batch_size, - int channel_size, int height, int width, float* dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchNormFoldGrad(const float* d_batch_mean, const float* d_batch_std, + const float* x, const float* batch_mean, + const float* batch_std, int batch_size, int channel_size, + int height, int width, float* dx, cudaStream_t cuda_stream); template void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream) { @@ -84,5 +84,5 @@ void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream) { thrust::fill(thrust::cuda::par.on(cuda_stream), dev_ptr, dev_ptr + size, tofill); } -template void ThrustFillWith(float* array, int size, float tofill, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ThrustFillWith(float* array, int size, float tofill, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh new file mode 100755 index 00000000000..5c02d9aedc4 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh @@ -0,0 +1,33 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void CalUpdateRunningStd(int channel_size, double epsilon, T* running_std, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* x, const T* batch_mean, + const T* batch_std, int batch_size, int channel_size, int height, int width, + T* dx, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_FOLD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cu similarity index 82% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cu index dba71d8f693..f62541422c3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cu @@ -21,6 +21,7 @@ #include #include "batchnorm_grad_impl.cuh" #include "include/cuda_runtime.h" +#include "include/cuda_fp16.h" const int kWarpSize = 32; const int kBlockSize = 1024; @@ -111,10 +112,12 @@ void CalBatchNormGrad(T *x, T *dy, float *scale, float *save_mean, float *save_v epsilon, N, C, H, W); } -template void CalBatchNormGrad(float *x, float 
*dy, float *scale, float *save_mean, float *save_variance, - float *dx, float *bn_scale, float *bn_bias, double epsilon, int N, int C, int H, - int W, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchNormGrad(float *x, float *dy, float *scale, float *save_mean, + float *save_variance, float *dx, float *bn_scale, float *bn_bias, + double epsilon, int N, int C, int H, int W, + cudaStream_t cuda_stream); -template void CalBatchNormGrad(half *x, half *dy, float *scale, float *save_mean, float *save_variance, half *dx, - float *bn_scale, float *bn_bias, double epsilon, int N, int C, int H, int W, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchNormGrad(half *x, half *dy, float *scale, float *save_mean, + float *save_variance, half *dx, float *bn_scale, float *bn_bias, + double epsilon, int N, int C, int H, int W, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cuh new file mode 100644 index 00000000000..c7edd82b2e2 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cuh @@ -0,0 +1,24 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalBatchNormGrad(T *x, T *dy, float *scale, float *save_mean, float *save_variance, T *dx, + float *bn_scale, float *bn_bias, double epsilon, int N, int C, int H, int W, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHNORM_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cu new file mode 100644 index 00000000000..aaea0c0bc89 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cu @@ -0,0 +1,139 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include "batchtospace_impl.cuh" +#include "include/cuda_fp16.h" + +template +__global__ void BatchToSpace(const size_t size, const T *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, const size_t block_num, + T *output) { + size_t temp_stride = 0; + size_t temp_pos = 0; + size_t idx_on = 0; + size_t idx_oc = 0; + size_t idx_oh = 0; + size_t idx_ow = 0; + size_t idx_in = 0; + size_t input_pos = 0; + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; + pos += blockDim.x * gridDim.x) { + temp_stride = oc * oh * ow; + idx_on = pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= oc; + idx_oc = temp_pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= oh; + idx_oh = temp_pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= ow; + idx_ow = temp_pos / temp_stride; + + idx_in = (((idx_oh + crop_up) % block_num) * block_num + ((idx_ow + crop_lft) % block_num)) * on + idx_on; + input_pos = idx_in * ic; + input_pos = (input_pos + idx_oc) * ih; + input_pos = (input_pos + ((idx_oh + crop_up) - (idx_in / (on * block_num))) / block_num) * iw; + input_pos = (input_pos + ((idx_ow + crop_lft) - ((idx_in / on) % block_num)) / block_num); + output[pos] = input[input_pos]; + } + return; +} + +template +void CalBatchToSpace(const size_t size, const T *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, const size_t block_num, + T *output, cudaStream_t cuda_stream) { + BatchToSpace<<>>( + size, input, in, ih, iw, ic, on, oh, ow, oc, crop_up, crop_dn, crop_lft, crop_rht, block_num, output); + return; +} + +template CUDA_LIB_EXPORT 
void CalBatchToSpace(const size_t size, const float *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, + const size_t block_num, float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const half *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, + const size_t block_num, half *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const int *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, const size_t block_num, + int *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const int64_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, + const size_t block_num, int64_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const int16_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, + const size_t block_num, int16_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const int8_t 
*input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, + const size_t block_num, int8_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const uint8_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, + const size_t block_num, uint8_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const uint16_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, + const size_t block_num, uint16_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const uint32_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, + const size_t block_num, uint32_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const uint64_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, + const size_t block_num, uint64_t *output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh new file mode 100644 index 00000000000..47433e51d73 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHTOSPACE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHTOSPACE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalBatchToSpace(const size_t size, const T *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t crop_up, const size_t crop_dn, + const size_t crop_lft, const size_t crop_rht, const size_t block_num, + T *output, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BATCHTOSPACE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cu similarity index 76% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cu rename to 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cu index 8a2d6b0edc8..58731e05175 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh" +#include "include/cuda_fp16.h" __device__ __forceinline__ size_t Index(const size_t &index, const size_t &dim) { return dim == 1 ? 0 : index; } @@ -114,15 +115,18 @@ void CalBCEWithLogitsLoss(const size_t input_size, const T *predict, const T *ta return; } -template void CalBCEWithLogitsLoss(const size_t input_size, const half *predict, const half *target, - const size_t *input_shape, const size_t shape_size, const half *weight, - const size_t *weight_shape, const bool weight_need_broadcast, - const half *pos_weight, const size_t *pos_weight_shape, - const bool pos_weight_need_broadcast, half *shape_broadcasted, half *output, - cudaStream_t cuda_stream); -template void CalBCEWithLogitsLoss(const size_t input_size, const float *predict, const float *target, - const size_t *input_shape, const size_t shape_size, const float *weight, - const size_t *weight_shape, const bool weight_need_broadcast, - const float *pos_weight, const size_t *pos_weight_shape, - const bool pos_weight_need_broadcast, float *shape_broadcasted, float *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBCEWithLogitsLoss(const size_t input_size, const half *predict, + const half *target, const size_t *input_shape, + const size_t shape_size, const half *weight, + const size_t *weight_shape, const bool weight_need_broadcast, + const half *pos_weight, const size_t *pos_weight_shape, + const bool pos_weight_need_broadcast, half *shape_broadcasted, + half *output, 
cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBCEWithLogitsLoss(const size_t input_size, const float *predict, + const float *target, const size_t *input_shape, + const size_t shape_size, const float *weight, + const size_t *weight_shape, const bool weight_need_broadcast, + const float *pos_weight, const size_t *pos_weight_shape, + const bool pos_weight_need_broadcast, + float *shape_broadcasted, float *output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh new file mode 100644 index 00000000000..531816f836e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh @@ -0,0 +1,30 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BCE_WITH_LOGITS_LOSS_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BCE_WITH_LOGITS_LOSS_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#define MAX_LOGITS_DIMENSION 8 + +template +CUDA_LIB_EXPORT void CalBCEWithLogitsLoss(const size_t input_size, const T *predict, const T *target, + const size_t *input_shape, const size_t shape_size, const T *weight, + const size_t *weight_shape, const bool weight_need_broadcast, + const T *pos_weight, const size_t *pos_weight_shape, + const bool pos_weight_need_broadcast, T *shape_broadcasted, T *output, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BCE_WITH_LOGITS_LOSS_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cu similarity index 86% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cu index 5ae02f0198a..53a1c8401d9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cu @@ -17,9 +17,8 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh" const int kWarpSize = 32; // tuning param, for those nhw >= kLargeSize, launch more blocks to solve @@ -165,11 +164,13 @@ void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, return; } -template void 
CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width, - const float* dy, float* db, cudaStream_t cuda_stream); -template void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width, - const half* dy, half* db, cudaStream_t cuda_stream); -template void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, - const float* dy, float* db, cudaStream_t cuda_stream); -template void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, const half* dy, - half* db, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, + const int height, const int width, + const float* dy, float* db, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, + const int height, const int width, + const half* dy, half* db, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, + const float* dy, float* db, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, const half* dy, + half* db, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh new file mode 100644 index 00000000000..b95060aa154 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BIAS_ADD_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BIAS_ADD_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalBiasAddGradNHWC(const size_t size, const size_t bias_size, + const T* dy, T* db, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width, + const T* dy, T* db, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BIAS_ADD_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cu similarity index 79% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cu index ef6e4575d0f..75191547168 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh" template __global__ void BoundingBoxDecodeKernel(const size_t size, const T *rois, const T *deltas, T *bboxes, const float m1, @@ -74,8 +74,11 @@ void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bbo ratio_clip); } -template void BoundingBoxDecode(const size_t size, const float *rois, const float *deltas, float *bboxes, - const float &m1, const float &m2, const float &m3, const float &m4, - const float &s1, const float &s2, const float &s3, const float &s4, - const int &max_height, const int &max_width, const float &ratio_clip, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void BoundingBoxDecode(const size_t size, const float *rois, const float *deltas, + float *bboxes, + const float &m1, const float &m2, + const float &m3, const float &m4, + const float &s1, const float &s2, + const float &s3, const float &s4, + const int &max_height, const int &max_width, + const float &ratio_clip, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh new file mode 100644 index 00000000000..bb887299582 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_DECODE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_DECODE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bboxes, + const float &m1, const float &m2, const float &m3, const float &m4, + const float &s1, const float &s2, const float &s3, const float &s4, + const int &max_height, const int &max_width, const float &ratio_clip, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_DECODE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cu similarity index 78% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cu index 155c7fe6936..927de52b0f0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh" template __global__ void BoundingBoxEncodeKernel(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas, @@ -56,7 +56,10 @@ void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtr m1, m2, m3, m4, s1, s2, s3, s4); } -template void BoundingBoxEncode(const size_t size, const float *anchor_box, const float *groundtruth_box, - float *deltas, const float &m1, const float &m2, const float &m3, - const float &m4, const float &s1, const float &s2, const float &s3, - const float &s4, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void BoundingBoxEncode(const size_t size, const float *anchor_box, + const float *groundtruth_box, float *deltas, + const float &m1, const float &m2, + const float &m3, const float &m4, + const float &s1, const float &s2, + const float &s3, const float &s4, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh new file mode 100644 index 00000000000..c7322f87c30 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_ENCODE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_ENCODE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas, + const float &m1, const float &m2, const float &m3, const float &m4, + const float &s1, const float &s2, const float &s3, const float &s4, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BOUNDINGBOX_ENCODE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cu similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cu index 2c751d7f438..47d2a4c6581 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cu @@ -14,9 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template struct MinimumGradFunc { @@ -113,37 +112,48 @@ void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, NoBroadcastGradKernel<<>>(nums, grad_x1, grad_x2, op, x1, x2, dy, dx1, dx2); } -template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, - const double *x1, const double *x2, const double *dy, double *dx1, double *dx2, - cudaStream_t stream); -template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, - const float *x1, const float *x2, const float *dy, float *dx1, float *dx2, - cudaStream_t stream); -template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, - const int *x1, const int *x2, const int *dy, int *dx1, int *dx2, cudaStream_t stream); -template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, - const half *x1, const half *x2, const half *dy, half *dx1, half *dx2, - cudaStream_t stream); -template void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, - const int64_t *x1, const int64_t *x2, const int64_t *dy, int64_t *dx1, int64_t *dx2, - cudaStream_t stream); -template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, - const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, - const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const double *x1, - const double *x2, const double *dy, double *dx1, 
double *dx2, cudaStream_t stream); -template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, - const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, - const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const float *x1, - const float *x2, const float *dy, float *dx1, float *dx2, cudaStream_t stream); -template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, - const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, - const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const int *x1, - const int *x2, const int *dy, int *dx1, int *dx2, cudaStream_t stream); -template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, - const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, - const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const half *x1, - const half *x2, const half *dy, half *dx1, half *dx2, cudaStream_t stream); -template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, - const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, - const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const int64_t *x1, - const int64_t *x2, const int64_t *dy, int64_t *dx1, int64_t *dx2, cudaStream_t stream); +template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, + enum BroadcastGradOpType op, const double *x1, const double *x2, + const double *dy, double *dx1, double *dx2, cudaStream_t stream); +template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, + enum BroadcastGradOpType op, const float *x1, const float *x2, + const float *dy, float 
*dx1, float *dx2, cudaStream_t stream); +template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, + enum BroadcastGradOpType op, const int *x1, const int *x2, + const int *dy, int *dx1, int *dx2, cudaStream_t stream); +template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, + enum BroadcastGradOpType op, const half *x1, const half *x2, + const half *dy, half *dx1, half *dx2, cudaStream_t stream); +template CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, + enum BroadcastGradOpType op, const int64_t *x1, const int64_t *x2, + const int64_t *dy, int64_t *dx1, int64_t *dx2, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, + const int &r0, const int &r1, const int &r2, const int &r3, + const int &d0, const int &d1, const int &d2, const int &d3, + const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, + const double *x1, const double *x2, const double *dy, + double *dx1, double *dx2, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, + const int &r0, const int &r1, const int &r2, const int &r3, + const int &d0, const int &d1, const int &d2, const int &d3, + const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, + const float *x1, const float *x2, const float *dy, float *dx1, float *dx2, + cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, + const int &r0, const int &r1, const int &r2, const int &r3, + const int &d0, const int &d1, const int &d2, const int &d3, + const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, + const int *x1, const int *x2, const int *dy, int *dx1, int *dx2, + cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastGrad(const 
int &l0, const int &l1, const int &l2, const int &l3, + const int &r0, const int &r1, const int &r2, const int &r3, + const int &d0, const int &d1, const int &d2, const int &d3, + const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, + const half *x1, const half *x2, const half *dy, half *dx1, half *dx2, + cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, + const int &r0, const int &r1, const int &r2, const int &r3, + const int &d0, const int &d1, const int &d2, const int &d3, + const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, + const int64_t *x1, const int64_t *x2, const int64_t *dy, + int64_t *dx1, int64_t *dx2, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh new file mode 100644 index 00000000000..2935a56cf35 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +enum BroadcastGradOpType { + BROADCAST_GRAD_TYPE_MAXIMUM = 0, + BROADCAST_GRAD_TYPE_MINIMUM = 1, + BROADCAST_GRAD_TYPE_INVALID = 0xffffffff, +}; + +template +CUDA_LIB_EXPORT void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, + const int &r0, const int &r1, const int &r2, const int &r3, + const int &d0, const int &d1, const int &d2, const int &d3, + const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, + const T *x1, const T *x2, const T *dy, T *dx1, T *dx2, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, + enum BroadcastGradOpType op, + const T *x1, const T *x2, const T *dy, T *dx1, T *dx2, cudaStream_t stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu similarity index 68% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu index d7b7314161f..bc1bef0b6e8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu @@ -16,9 +16,8 @@ #include #include - -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" +#include "include/cuda_fp16.h" // Basic function template @@ -566,30 +565,30 @@ void ElewiseCmp(const 
int &nums, enum BroadcastOpType op, const T *x0, const T * } } -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int8_t *x0, const int8_t *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint8_t *x0, const uint8_t *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int64_t *x0, const int64_t *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int16_t *x0, const int16_t *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint16_t *x0, const uint16_t *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint32_t *x0, const uint32_t *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const uint64_t *x0, const uint64_t *x1, bool *y, - cudaStream_t stream); -template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const bool *x0, const bool *x1, bool *y, - cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const double *x0, const double *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const float *x0, const float *x1, bool *y, 
cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const half *x0, const half *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const int *x0, const int *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const int8_t *x0, const int8_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const uint8_t *x0, const uint8_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const int64_t *x0, const int64_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const int16_t *x0, const int16_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const uint16_t *x0, const uint16_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const uint32_t *x0, const uint32_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const uint64_t *x0, const uint64_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, + const bool *x0, const bool *x1, bool *y, cudaStream_t stream); // Element-wise ArithMetic template __global__ void ElewiseArithKernel(const int nums, const T *x0, const T *x1, T *y) { @@ -703,46 +702,46 @@ void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const T1 *x0, return ElewiseArithComplexKernel(nums, op, x0, x1, y, stream); } -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1, double *y, - cudaStream_t stream); -template 
void ElewiseArith(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, float *y, - cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, half *y, - cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, int *y, - cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int8_t *x0, const int8_t *x1, int8_t *y, - cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint8_t *x0, const uint8_t *x1, uint8_t *y, - cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int64_t *x0, const int64_t *x1, int64_t *y, - cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int16_t *x0, const int16_t *x1, int16_t *y, - cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint16_t *x0, const uint16_t *x1, - uint16_t *y, cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint32_t *x0, const uint32_t *x1, - uint32_t *y, cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const uint64_t *x0, const uint64_t *x1, - uint64_t *y, cudaStream_t stream); -template void ElewiseArith(const int &nums, enum BroadcastOpType op, const bool *x0, const bool *x1, bool *y, - cudaStream_t stream); -template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex *x0, - const Complex *x1, Complex *y, cudaStream_t stream); -template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex *x0, const float *x1, - Complex *y, cudaStream_t stream); -template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0, const Complex *x1, - Complex *y, cudaStream_t stream); 
-template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex *x0, - const Complex *x1, Complex *y, cudaStream_t stream); -template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex *x0, const double *x1, - Complex *y, cudaStream_t stream); -template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0, const Complex *x1, - Complex *y, cudaStream_t stream); -template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, - Complex *y, cudaStream_t stream); -template void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1, - Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const double *x0, const double *x1, double *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const float *x0, const float *x1, float *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const half *x0, const half *x1, half *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const int *x0, const int *x1, int *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const int8_t *x0, const int8_t *x1, int8_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const uint8_t *x0, const uint8_t *x1, uint8_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const int64_t *x0, const int64_t *x1, int64_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const int16_t *x0, const int16_t *x1, int16_t *y, cudaStream_t stream); 
+template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const uint16_t *x0, const uint16_t *x1, uint16_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const uint32_t *x0, const uint32_t *x1, uint32_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const uint64_t *x0, const uint64_t *x1, uint64_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, + const bool *x0, const bool *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex *x0, + const Complex *x1, Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex *x0, + const float *x1, Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0, + const Complex *x1, Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex *x0, + const Complex *x1, Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const Complex *x0, + const double *x1, Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0, + const Complex *x1, Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const float *x0, + const float *x1, Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const double *x0, + const double *x1, Complex *y, cudaStream_t stream); // Broadcast comparison __device__ 
__forceinline__ size_t Index(const size_t &index, const size_t &dim) { return dim == 1 ? 0 : index; } @@ -836,42 +835,42 @@ void BroadcastCmp(const std::vector &x0_dims, const std::vector } } -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const double *x0, - const double *x1, bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const float *x0, const float *x1, - bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const half *x0, const half *x1, - bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const int *x0, const int *x1, - bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const int8_t *x0, - const int8_t *x1, bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const uint8_t *x0, - const uint8_t *x1, bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const int64_t *x0, - const int64_t *x1, bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const int16_t *x0, - const int16_t *x1, bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const uint16_t *x0, 
- const uint16_t *x1, bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const uint32_t *x0, - const uint32_t *x1, bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const uint64_t *x0, - const uint64_t *x1, bool *y, cudaStream_t stream); -template void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const bool *x0, const bool *x1, - bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const double *x0, + const double *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const float *x0, + const float *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const half *x0, + const half *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const int *x0, + const int *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const int8_t *x0, + const int8_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const uint8_t *x0, const uint8_t *x1, bool *y, cudaStream_t 
stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const int64_t *x0, const int64_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const int16_t *x0, const int16_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const uint16_t *x0, const uint16_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const uint32_t *x0, const uint32_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const uint64_t *x0, const uint64_t *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const bool *x0, + const bool *x1, bool *y, cudaStream_t stream); // Broadcast Arithmetic template __global__ void BroadcastArithKernel(const size_t l0, const size_t l1, const size_t l2, const size_t l3, @@ -1097,69 +1096,82 @@ void BroadcastComplexArith(const std::vector &x0_dims, const std::vector } } -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const double *x0, - const double *x1, double *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const float *x0, - const float *x1, float *y, 
cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const half *x0, const half *x1, - half *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const int *x0, const int *x1, - int *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const int8_t *x0, - const int8_t *x1, int8_t *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const uint8_t *x0, - const uint8_t *x1, uint8_t *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const int64_t *x0, - const int64_t *x1, int64_t *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const int16_t *x0, - const int16_t *x1, int16_t *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const uint16_t *x0, - const uint16_t *x1, uint16_t *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const uint32_t *x0, - const uint32_t *x1, uint32_t *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const uint64_t *x0, - const uint64_t *x1, uint64_t *y, cudaStream_t stream); -template void BroadcastArith(const std::vector &x0_dims, const 
std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const bool *x0, const bool *x1, - bool *y, cudaStream_t stream); -template void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, - const Complex *x0, const Complex *x1, Complex *y, - cudaStream_t stream); -template void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, - const Complex *x0, const float *x1, Complex *y, cudaStream_t stream); -template void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const float *x0, - const Complex *x1, Complex *y, cudaStream_t stream); -template void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, - const Complex *x0, const Complex *x1, Complex *y, - cudaStream_t stream); -template void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, - const Complex *x0, const double *x1, Complex *y, - cudaStream_t stream); -template void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const double *x0, - const Complex *x1, Complex *y, cudaStream_t stream); -template void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const double *x0, - const double *x1, Complex *y, cudaStream_t stream); -template void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, - const std::vector &y_dims, enum BroadcastOpType op, const float *x0, - const float *x1, Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector 
&x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const double *x0, const double *x1, double *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const float *x0, const float *x1, float *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const half *x0, + const half *x1, half *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const int *x0, + const int *x1, int *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const int8_t *x0, const int8_t *x1, int8_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const uint8_t *x0, const uint8_t *x1, uint8_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const int64_t *x0, const int64_t *x1, int64_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const int16_t *x0, const int16_t *x1, int16_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const uint16_t *x0, const uint16_t *x1, uint16_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const 
std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const uint32_t *x0, const uint32_t *x1, uint32_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const uint64_t *x0, const uint64_t *x1, uint64_t *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const bool *x0, + const bool *x1, bool *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, + const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const Complex *x0, const Complex *x1, + Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, + const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const Complex *x0, const float *x1, Complex *y, + cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, + const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const float *x0, const Complex *x1, Complex *y, + cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, + const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const Complex *x0, const Complex *x1, + Complex *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, + const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const Complex *x0, const double *x1, Complex *y, + cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, + const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, 
+ const double *x0, const Complex *x1, Complex *y, + cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, + const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const double *x0, const double *x1, Complex *y, + cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, + const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, + const float *x0, const float *x1, Complex *y, + cudaStream_t stream); // BroadcastTo template @@ -1186,24 +1198,24 @@ void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const siz output_addr); } -template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0, - const size_t &o1, const size_t &o2, const size_t &o3, const double *input_addr, - double *output_addr, cudaStream_t stream); -template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0, - const size_t &o1, const size_t &o2, const size_t &o3, const float *input_addr, - float *output_addr, cudaStream_t stream); -template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0, - const size_t &o1, const size_t &o2, const size_t &o3, const half *input_addr, - half *output_addr, cudaStream_t stream); -template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0, - const size_t &o1, const size_t &o2, const size_t &o3, const int16_t *input_addr, - int16_t *output_addr, cudaStream_t stream); -template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0, - const size_t &o1, const size_t &o2, const size_t &o3, const int32_t *input_addr, - int32_t *output_addr, cudaStream_t stream); -template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, 
const size_t &i3, const size_t &o0, - const size_t &o1, const size_t &o2, const size_t &o3, const int64_t *input_addr, - int64_t *output_addr, cudaStream_t stream); -template void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0, - const size_t &o1, const size_t &o2, const size_t &o3, const bool *input_addr, - bool *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const double *input_addr, double *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const float *input_addr, float *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const half *input_addr, half *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const int16_t *input_addr, int16_t *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const int32_t *input_addr, int32_t *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void 
BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const bool *input_addr, bool *output_addr, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh new file mode 100644 index 00000000000..2dc973988e4 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh @@ -0,0 +1,90 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" + +const float kFloatEplison = 1e-37; + +enum BroadcastOpType { + BROADCAST_TYPE_GREATER = 0, + BROADCAST_TYPE_LESS = 1, + BROADCAST_TYPE_MAXIMUM = 2, + BROADCAST_TYPE_MINIMUM = 3, + BROADCAST_TYPE_POWER = 4, + BROADCAST_TYPE_REALDIV = 5, + BROADCAST_TYPE_MUL = 6, + BROADCAST_TYPE_SUB = 7, + BROADCAST_TYPE_ADD = 8, + BROADCAST_TYPE_FLOORDIV = 9, + BROADCAST_TYPE_ABSGRAD = 10, + BROADCAST_TYPE_DIV = 11, + BROADCAST_TYPE_DIVNONAN = 12, + BROADCAST_TYPE_EQUAL = 13, + BROADCAST_TYPE_SQUARED_DIFFERENCE = 14, + BROADCAST_TYPE_MOD = 15, + BROADCAST_TYPE_FLOORMOD = 16, + BROADCAST_TYPE_ATAN2 = 17, + BROADCAST_TYPE_GREATER_EQUAL = 18, + BROADCAST_TYPE_LESS_EQUAL = 19, + BROADCAST_TYPE_NOT_EQUAL = 20, + BROADCAST_TYPE_LOGICAL_AND = 21, + BROADCAST_TYPE_LOGICAL_OR = 22, + BROADCAST_TYPE_TRUNCATEDIV = 23, + BROADCAST_TYPE_TRUNCATEMOD = 24, + BROADCAST_TYPE_COMPLEX = 25, + BROADCAST_TYPE_INVALID = 0xffffffff, +}; + +template +CUDA_LIB_EXPORT void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, + cudaStream_t stream); + +template +CUDA_LIB_EXPORT void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, + cudaStream_t stream); + +template +CUDA_LIB_EXPORT void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const T1 *x0, const T2 *x1, + Complex *y, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void BroadcastCmp(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const T *x0, + const T *x1, bool *y, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void BroadcastArith(const std::vector &x0_dims, const 
std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const T *x0, + const T *x1, T *y, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const T1 *x0, + const T2 *x1, Complex *y, cudaStream_t stream); +template +CUDA_LIB_EXPORT void BroadcastComplexArith(const std::vector &x0_dims, const std::vector &x1_dims, + const std::vector &y_dims, enum BroadcastOpType op, const T *x0, + const T *x1, Complex *y, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const T *input_addr, T *output_addr, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_BROADCAST_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cu similarity index 72% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cu index c4a759a5be0..9a383d77d32 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cu @@ -17,7 +17,8 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void CastAll(T** inputs, S** output, const size_t num, const size_t *size) { @@ -34,7 +35,7 @@ void CastAllKernel(T** inputs, S** output, const size_t max, const size_t num, c CastAll<<>>(inputs, output, num, size); return; } -template void CastAllKernel(half** inputs, float** output, 
const size_t max, const size_t num, - const size_t *size, cudaStream_t stream); -template void CastAllKernel(float** inputs, half** output, const size_t max, const size_t num, - const size_t *size, cudaStream_t stream); +template CUDA_LIB_EXPORT void CastAllKernel(half** inputs, float** output, const size_t max, const size_t num, + const size_t *size, cudaStream_t stream); +template CUDA_LIB_EXPORT void CastAllKernel(float** inputs, half** output, const size_t max, const size_t num, + const size_t *size, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh similarity index 56% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh index fd2ccc188a7..63303e792b1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh @@ -14,11 +14,11 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_ALL_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_ALL_IMPL_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CastAllKernel(T **inputs, S **output, const size_t max, const size_t num, const size_t *size, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_ +CUDA_LIB_EXPORT void CastAllKernel(T **inputs, S **output, const size_t max, const size_t num, const size_t *size, + cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_ALL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cu new file mode 100644 index 00000000000..9d373a7c826 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cu @@ -0,0 +1,509 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh" +#include "include/cuda_fp16.h" + +// Generic cast +template +__device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) { + *output_addr = static_cast((*input_addr)); +} + +// half --> integer +__device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) { + *output_addr = __half2ull_rz((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) { + *output_addr = __half2ll_rz((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) { + *output_addr = __half2uint_rz((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) { + *output_addr = __half2int_rz((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const half *input_addr, uint16_t *output_addr) { + *output_addr = __half2ushort_rz((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const half *input_addr, int16_t *output_addr) { + *output_addr = __half2short_rz((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const half *input_addr, uint8_t *output_addr) { + *output_addr = static_cast(__half2ushort_rz((*input_addr))); +} + +__device__ __forceinline__ void CastBase(const half *input_addr, int8_t *output_addr) { + *output_addr = static_cast(__half2short_rz((*input_addr))); +} + +// integer --> half +__device__ __forceinline__ void CastBase(const uint64_t *input_addr, half *output_addr) { + *output_addr = __ull2half_rn((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const int64_t *input_addr, half *output_addr) { + *output_addr = __ll2half_rn((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const uint32_t *input_addr, half *output_addr) { + *output_addr = __uint2half_rn((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const int32_t 
*input_addr, half *output_addr) { + *output_addr = __int2half_rn((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const uint16_t *input_addr, half *output_addr) { + *output_addr = __ushort2half_rn((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const int16_t *input_addr, half *output_addr) { + *output_addr = __short2half_rn((*input_addr)); +} + +__device__ __forceinline__ void CastBase(const uint8_t *input_addr, half *output_addr) { + *output_addr = __ushort2half_rn(static_cast(*input_addr)); +} + +__device__ __forceinline__ void CastBase(const int8_t *input_addr, half *output_addr) { + *output_addr = __short2half_rn(static_cast(*input_addr)); +} + +// Cast +template +__global__ void CastKernel(const int input_size, const S *input_addr, T *output_addr) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < input_size; pos += blockDim.x * gridDim.x) { + CastBase(input_addr + pos, output_addr + pos); + } +} + +template +void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream) { + CastKernel<<>>(input_size, input_addr, output_addr); +} + +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, uint32_t 
*output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int8_t *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, uint64_t *output_addr, + cudaStream_t stream); 
+template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int16_t *input_addr, Complex *output_addr, + cudaStream_t stream); + + +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void 
Cast(const int input_size, const int32_t *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int32_t *input_addr, Complex *output_addr, + cudaStream_t stream); + + +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const 
int64_t *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const int64_t *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, bool *output_addr, + 
cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint8_t *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, Complex *output_addr, + cudaStream_t stream); 
+template CUDA_LIB_EXPORT void Cast(const int input_size, const uint16_t *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint32_t *input_addr, Complex *output_addr, + cudaStream_t stream); + +template 
CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const uint64_t *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void 
Cast(const int input_size, const half *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const half *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, int32_t *output_addr, + 
cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const float *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const 
int input_size, const double *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const double *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, uint16_t *output_addr, + 
cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const bool *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void 
Cast(const int input_size, const Complex *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, Complex *output_addr, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, int8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, int16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, int32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, int64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, uint8_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, uint16_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, uint32_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, uint64_t *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, float *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex 
*input_addr, double *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, half *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, bool *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Cast(const int input_size, const Complex *input_addr, Complex *output_addr, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh similarity index 57% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh index b4ef646dbe8..3fe6249e059 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh @@ -14,13 +14,12 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_IMPL_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "utils/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" template -void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_H_ +CUDA_LIB_EXPORT void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CAST_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cu similarity index 71% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cu index 339929581bc..b2d4fa25c4e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void CheckValidKernel(const size_t size, const T *box, const T *img_metas, S *valid) { @@ -58,11 +59,11 @@ void CheckValid(const size_t &size, const T *box, const T *img_metas, S *valid, CheckValidKernel<<>>(size, box, img_metas, valid); } -template void CheckValid(const size_t &size, const float *box, const float *img_metas, bool *valid, - cudaStream_t cuda_stream); -template void CheckValid(const size_t &size, const half *box, const half *img_metas, bool *valid, - cudaStream_t cuda_stream); -template void CheckValid(const size_t &size, const short *box, const short *img_metas, bool *valid, // NOLINT - cudaStream_t cuda_stream); -template void CheckValid(const size_t &size, const unsigned char *box, const unsigned char *img_metas, bool *valid, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CheckValid(const size_t &size, const float *box, const float *img_metas, bool *valid, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CheckValid(const size_t &size, const half *box, const half *img_metas, bool *valid, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CheckValid(const size_t &size, const short *box, const short *img_metas, bool *valid, // NOLINT + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CheckValid(const size_t &size, const unsigned char *box, const unsigned char *img_metas, + bool *valid, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh similarity index 56% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh index 8870ae5a2bc..36086eb1982 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh @@ -14,12 +14,12 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CHECK_VALID_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CHECK_VALID_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CHECK_VALID_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CHECK_VALID_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CheckValid(const size_t &size, const T *box, const T *img_metas, S *valid, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CheckValid(const size_t &size, const T *box, const T *img_metas, S *valid, + cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CHECK_VALID_IMPL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CHECK_VALID_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cu similarity index 72% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cu index c1974341ad4..954cc9f6629 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh" +#include "include/cuda_fp16.h" // The implement of ScalingGradOp template @@ -38,11 +39,11 @@ void ScalingGradOp(const size_t size, const T *x, const float *scaling_factor, f ScalingGradKernel<<>>(size, x, scaling_factor, scaling_out_addr); } -template void ScalingGradOp(const size_t size, const float *x, const float *scaling_factor, - float *scaling_out_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScalingGradOp(const size_t size, const float *x, const float *scaling_factor, + float *scaling_out_addr, cudaStream_t cuda_stream); -template void ScalingGradOp(const size_t size, const half *x, const float *scaling_factor, - float *scaling_out_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScalingGradOp(const size_t size, const half *x, const float *scaling_factor, + float *scaling_out_addr, cudaStream_t cuda_stream); // The implement of ClipGradNormOp template @@ -77,8 +78,10 @@ void ClipGradNormOp(const size_t size, const float *x, const T *clip_norm, const output_addr); } -template void ClipGradNormOp(const size_t size, const float *x, const float *clip_norm, - const float *reduce_sum_value, float *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ClipGradNormOp(const size_t size, const float *x, const float *clip_norm, + const float *reduce_sum_value, float *output_addr, + cudaStream_t cuda_stream); -template void ClipGradNormOp(const size_t size, const float *x, const half *clip_norm, - const float *reduce_sum_value, float *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ClipGradNormOp(const size_t size, const float *x, const half *clip_norm, + const float *reduce_sum_value, float *output_addr, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh new file mode 100644 index 00000000000..7faf8cb530c --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CLIP_GRAD_NORM_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CLIP_GRAD_NORM_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void ScalingGradOp(const size_t size, const T *x, const float *scaling_factor, float *scaling_out_addr, + cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void ClipGradNormOp(const size_t size, const float *x, const T *clip_norm, + const float *reduce_sum_value, float *output_addr, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CLIP_GRAD_NORM_IMPL_CUH_ diff --git a/mindspore/ccsrc/utils/complex.h b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h similarity index 99% rename from mindspore/ccsrc/utils/complex.h rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h index cd83ebfd8b7..0779504dd38 100644 --- a/mindspore/ccsrc/utils/complex.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h 
@@ -16,12 +16,12 @@ #ifndef MINDSPORE_CCSRC_UTILS_COPLEX_H_ #define MINDSPORE_CCSRC_UTILS_COPLEX_H_ -#include -#include #ifdef ENABLE_GPU #include #include #endif +#include +#include #include "base/float16.h" #if defined(__CUDACC__) #define HOST_DEVICE __host__ __device__ @@ -32,8 +32,9 @@ namespace mindspore { namespace utils { // Implement Complex for mindspore, inspired by std::complex. +constexpr int T_SIZE = 2; template -struct alignas(sizeof(T) * 2) Complex { +struct alignas(sizeof(T) * T_SIZE) Complex { Complex() = default; ~Complex() = default; @@ -315,12 +316,9 @@ HOST_DEVICE inline T abs(const Complex &z) { template using Complex = mindspore::utils::Complex; - namespace std { - template class numeric_limits> : public numeric_limits {}; - } // namespace std #endif // MINDSPORE_CCSRC_UTILS_COPLEX_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cu new file mode 100755 index 00000000000..df9462df770 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cu @@ -0,0 +1,93 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh" +#include "include/cuda_fp16.h" +template +__global__ void Concat(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis, + int *len_axis, T **inputs, T *output) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { + int num = pos % all_size_before_axis / all_size_axis; + int block = -1; + int axis_inc = 0; + int block_len = 0; + for (int i = 0; i < input_num; i++) { + if (axis_inc <= num) { + block++; + axis_inc += len_axis[i]; + } else { + break; + } + } + block_len = len_axis[block]; + axis_inc -= len_axis[block]; + int block_pos = + pos / all_size_before_axis * block_len * all_size_axis + (num - axis_inc) * all_size_axis + pos % all_size_axis; + output[pos] = inputs[block][block_pos]; + } + return; +} + +template +void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis, + int *len_axis, T **inputs, T *output, cudaStream_t cuda_stream) { + Concat<<>>(size, input_num, all_size_before_axis, all_size_axis, + len_axis, inputs, output); + return; +} + +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, double **inputs, double *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, float **inputs, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, half **inputs, half *output, + cudaStream_t cuda_stream); + +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + 
const int all_size_axis, int *len_axis, int64_t **inputs, int64_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, int **inputs, int *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, short **inputs, short *output, // NOLINT + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, char **inputs, char *output, + cudaStream_t cuda_stream); + +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, uint64_t **inputs, uint64_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, uint32_t **inputs, uint32_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, uint16_t **inputs, uint16_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, unsigned char **inputs, + unsigned char *output, cudaStream_t cuda_stream); + +template CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, bool **inputs, bool *output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh similarity index 51% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh index 8b65d14467c..b09486847ce 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh @@ -14,11 +14,11 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CONCATV2_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CONCATV2_IMPL_CUH_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONCATV2_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONCATV2_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis, - int *len_axis, T **inputs, T *output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CONCATV2_IMPL_CUH_ +CUDA_LIB_EXPORT void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, + const int all_size_axis, int *len_axis, T **inputs, T *output, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONCATV2_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cu similarity index 79% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cu index c4bba2863c0..d1a9f81b549 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cu @@ -85,16 +85,18 @@ void ConvertGradientBack(const size_t size, const size_t height_h, const size_t size, height_h, height_w, ori_h, ori_w, batchwidth, width, input_addr, output_addr); } -template void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, - const size_t batchwidth, const size_t width, float *input_addr, float *output_addr, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, + const size_t batchwidth, const size_t width, float *input_addr, + float *output_addr, cudaStream_t cuda_stream); -template void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, - const size_t batchwidth, const size_t width, float *input_addr, - float *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConvertGradientBack(const size_t size, const size_t height_h, + const size_t height_w, const size_t batchwidth, + const size_t width, float *input_addr, + float *output_addr, cudaStream_t cuda_stream); -template void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, - const size_t ori_h, const size_t ori_w, const size_t batchwidth, - const size_t width, float *input_addr, float *output_addr, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ConvertGradientBack(const size_t size, const size_t height_h, + const size_t height_w, const size_t ori_h, + const size_t ori_w, const size_t batchwidth, + const size_t width, float *input_addr, float *output_addr, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cuh new file mode 100644 index 
00000000000..8ddfc08db38 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cuh @@ -0,0 +1,35 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONVERT_GRADIENT_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONVERT_GRADIENT_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, + const size_t batchwidth, const size_t width, T *input_addr, T *outt_addr, + cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, + const size_t batchwidth, const size_t width, T *input_addr, T *output_addr, + cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, + const size_t ori_h, const size_t ori_w, const size_t batchwidth, + const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CONVERT_GRADIENT_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cu 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cu similarity index 79% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cu index b7a22dab73b..21146575b7e 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cu @@ -16,7 +16,6 @@ #include #include "correction_mul_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" template __global__ void CorrectionMul(const T* weight, const T* gamma, const T* running_std, const int batchsize, const int chw, @@ -52,8 +51,9 @@ void CalCorrectionMul(const T* weight, const T* gamma, const T* running_std, int output); } -template void CalCorrectionMul(const float* weight, const float* gamma, const float* running_std, int N, int C, - int H, int W, float* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCorrectionMul(const float* weight, const float* gamma, const float* running_std, + int N, int C, int H, int W, float* output, + cudaStream_t cuda_stream); template void CalCorrectionMulGrad(const T* d_out, const T* weight, const T* running_std, int N, int C, int H, int W, T* d_gamma, @@ -62,5 +62,6 @@ void CalCorrectionMulGrad(const T* d_out, const T* weight, const T* running_std, Reduce<<>>(N, C * H * W, tmp, running_std, d_gamma); } -template void CalCorrectionMulGrad(const float* d_out, const float* weight, const float* running_std, int N, - int C, int H, int W, float* d_gamma, float* tmp, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCorrectionMulGrad(const float* d_out, const float* weight, + const float* running_std, int N, int C, int H, int W, + float* d_gamma, float* tmp, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh new file mode 100644 index 00000000000..1d7b0a1a9ac --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CORRECTION_MUL_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CORRECTION_MUL_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void CalCorrectionMul(const T* weight, const T* gamma, const T* running_std, int batch_size, + int channel_size, int height, int width, T* output, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalCorrectionMulGrad(const T* d_out, const T* weight, const T* running_std, int batch_size, + int channel_size, int height, int width, T* d_gamma, T* tmp, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CORRECTION_MUL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cu similarity index 53% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cu rename to 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cu index c1b605c063f..98518f220fa 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cu @@ -16,7 +16,8 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh" +#include "include/cuda_fp16.h" // for every position, first calculate position it mirrors from in the new padded array // adjust calculated position to origin dx array dimensions and copy value @@ -110,39 +111,47 @@ void CalCropAndResize(const size_t size, const T *input_image, float *input_boxe return; } -template void CalCropAndResize(const size_t size, const int8_t *input_image, float *input_boxes, - int *input_box_index, int batch, int input_height, int input_width, - int final_height, int final_width, int channel, int method, - float extrapol_val, float *output, cudaStream_t cuda_stream); -template void CalCropAndResize(const size_t size, const int16_t *input_image, float *input_boxes, - int *input_box_index, int batch, int input_height, int input_width, - int final_height, int final_width, int channel, int method, - float extrapol_val, float *output, cudaStream_t cuda_stream); -template void CalCropAndResize(const size_t size, const int32_t *input_image, float *input_boxes, - int *input_box_index, int batch, int input_height, int input_width, - int final_height, int final_width, int channel, int method, - float extrapol_val, float *output, cudaStream_t cuda_stream); -template void CalCropAndResize(const size_t size, const int64_t *input_image, float *input_boxes, - int *input_box_index, int batch, int input_height, int input_width, - int final_height, int final_width, int channel, int method, - float extrapol_val, float *output, cudaStream_t cuda_stream); -template void 
CalCropAndResize(const size_t size, const half *input_image, float *input_boxes, - int *input_box_index, int batch, int input_height, int input_width, - int final_height, int final_width, int channel, int method, - float extrapol_val, float *output, cudaStream_t cuda_stream); -template void CalCropAndResize(const size_t size, const float *input_image, float *input_boxes, - int *input_box_index, int batch, int input_height, int input_width, - int final_height, int final_width, int channel, int method, - float extrapol_val, float *output, cudaStream_t cuda_stream); -template void CalCropAndResize(const size_t size, const double *input_image, float *input_boxes, - int *input_box_index, int batch, int input_height, int input_width, - int final_height, int final_width, int channel, int method, - float extrapol_val, float *output, cudaStream_t cuda_stream); -template void CalCropAndResize(const size_t size, const uint8_t *input_image, float *input_boxes, - int *input_box_index, int batch, int input_height, int input_width, - int final_height, int final_width, int channel, int method, - float extrapol_val, float *output, cudaStream_t cuda_stream); -template void CalCropAndResize(const size_t size, const uint16_t *input_image, float *input_boxes, - int *input_box_index, int batch, int input_height, int input_width, - int final_height, int final_width, int channel, int method, - float extrapol_val, float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const int8_t *input_image, float *input_boxes, + int *input_box_index, int batch, int input_height, + int input_width, int final_height, int final_width, int channel, + int method, float extrapol_val, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const int16_t *input_image, + float *input_boxes, int *input_box_index, int batch, + int input_height, int input_width, int final_height, + int final_width, int 
channel, int method, float extrapol_val, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const int32_t *input_image, + float *input_boxes, int *input_box_index, int batch, + int input_height, int input_width, int final_height, + int final_width, int channel, int method, float extrapol_val, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const int64_t *input_image, + float *input_boxes, int *input_box_index, int batch, + int input_height, int input_width, int final_height, + int final_width, int channel, int method, float extrapol_val, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const half *input_image, float *input_boxes, + int *input_box_index, int batch, int input_height, int input_width, + int final_height, int final_width, int channel, int method, + float extrapol_val, float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const float *input_image, float *input_boxes, + int *input_box_index, int batch, int input_height, + int input_width, int final_height, int final_width, int channel, + int method, float extrapol_val, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const double *input_image, float *input_boxes, + int *input_box_index, int batch, int input_height, + int input_width, int final_height, int final_width, int channel, + int method, float extrapol_val, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const uint8_t *input_image, + float *input_boxes, int *input_box_index, int batch, + int input_height, int input_width, int final_height, + int final_width, int channel, int method, + float extrapol_val, float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void 
CalCropAndResize(const size_t size, const uint16_t *input_image, + float *input_boxes, int *input_box_index, int batch, + int input_height, int input_width, int final_height, + int final_width, int channel, int method, float extrapol_val, + float *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh new file mode 100644 index 00000000000..498a037fdd5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROP_AND_RESIZE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROP_AND_RESIZE_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalCropAndResize(const size_t size, const T *input_image, float *input_boxes, int *input_box_index, + int batch, int input_height, int input_width, int final_height, int final_width, + int channel, int method, float extrapol_val, float *output, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROP_AND_RESIZE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cu similarity index 72% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cu index 6fcbec1d545..b414a2a54b1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cu @@ -18,7 +18,7 @@ #include #include #include "cross_entropy_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void CrossEntropyWithSparseKernel(const T *logits, const S *labels, const size_t batch_size, @@ -104,16 +104,19 @@ void CrossEntropy(const T *logits, const S *labels, const size_t batch_size, con epsilon, losses, dlogits); } -template void CrossEntropyWithSparse(const float *logits, const int *labels, const size_t batch_size, - const size_t class_num, float *loss, cudaStream_t cuda_stream); -template void CrossEntropyWithSparse(const float *logits, const int64_t *labels, - const size_t batch_size, const size_t class_num, float 
*loss, - cudaStream_t cuda_stream); -template void CrossEntropyGradWithSparse(const float *logits, const int *labels, const size_t batch_size, - const size_t class_num, float *grad, cudaStream_t cuda_stream); -template void CrossEntropyGradWithSparse(const float *logits, const int64_t *labels, - const size_t batch_size, const size_t class_num, float *grad, - cudaStream_t cuda_stream); -template void CrossEntropy(const float *logits, const float *labels, const size_t batch_size, - const size_t class_num, float *losses, float *dlogits, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CrossEntropyWithSparse(const float *logits, const int *labels, + const size_t batch_size, const size_t class_num, + float *loss, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CrossEntropyWithSparse(const float *logits, const int64_t *labels, + const size_t batch_size, const size_t class_num, + float *loss, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CrossEntropyGradWithSparse(const float *logits, const int *labels, + const size_t batch_size, const size_t class_num, + float *grad, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CrossEntropyGradWithSparse(const float *logits, const int64_t *labels, + const size_t batch_size, + const size_t class_num, float *grad, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CrossEntropy(const float *logits, const float *labels, + const size_t batch_size, const size_t class_num, float *losses, + float *dlogits, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh new file mode 100644 index 00000000000..21ef77f92f5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh @@ -0,0 +1,35 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROSS_ENTROPY_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROSS_ENTROPY_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +// The batch size limit to judge whether to use multiple threads. +constexpr int kLargeBatchLowLimit = 32768; + +template +CUDA_LIB_EXPORT void CrossEntropyWithSparse(const T *logits, const S *labels, const size_t batch_size, + const size_t class_num, T *loss, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CrossEntropyGradWithSparse(const T *logits, const S *labels, const size_t batch_size, + const size_t class_num, T *grad, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CrossEntropy(const T *logits, const S *labels, const size_t batch_size, const size_t class_num, + T *losses, T *dlogits, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CROSS_ENTROPY_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cu similarity index 89% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cu index e41030a7893..695cb1af5ca 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cu +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cu @@ -16,7 +16,6 @@ #include #include "ctcloss_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" template __device__ T LogSumExp(const T logprob1, const T logprob2) { if (logprob1 == logprob2 && logprob1 == -std::numeric_limits::infinity()) { @@ -427,20 +426,23 @@ void CTCLoss(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_w label_squence_length, cum_labels_length, cost, grads, prob_num, ignore_longer_outputs_than_inputs); } -template void CalculateFwdVar(float *log_alpha_b, int *label_value_with_blank, float *softmax_probs, - const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet, - int maxtime, int blank, int *label_squence_length, int *cum_labels_length, - bool ignore_longer_outputs_than_inputs, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalculateFwdVar(float *log_alpha_b, int *label_value_with_blank, + float *softmax_probs, const int *sequence_length, + bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, + int blank, int *label_squence_length, int *cum_labels_length, + bool ignore_longer_outputs_than_inputs, cudaStream_t stream); -template void CalculateBwdVar(float *log_beta_b, int *label_value_with_blank, float *softmax_probs, - const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet, - int maxtime, int blank, int *label_squence_length, int *cum_labels_length, - bool ignore_longer_outputs_than_inputs, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalculateBwdVar(float *log_beta_b, int *label_value_with_blank, + float *softmax_probs, const int *sequence_length, + bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, + int blank, int *label_squence_length, int *cum_labels_length, + bool ignore_longer_outputs_than_inputs, cudaStream_t stream); -template void InnerSoftMax(const float *probs, float *softmax_probs, const int *sequence_length, int max_time, - int batch, int numclass, 
cudaStream_t stream); +template CUDA_LIB_EXPORT void InnerSoftMax(const float *probs, float *softmax_probs, const int *sequence_length, + int max_time, int batch, int numclass, cudaStream_t stream); -template void CTCLoss(float *log_alpha_b, float *log_beta_b, float *softmax_probs, int *label_value_with_blank, - int batch, int SOffSet, int maxtime, int numclass, const int *sequence_length, - int *label_squence_length, int *cum_labels_length, float *cost, float *grads, - float *prob_num, bool ignore_longer_outputs_than_inputs, cudaStream_t stream); +template CUDA_LIB_EXPORT void CTCLoss(float *log_alpha_b, float *log_beta_b, float *softmax_probs, + int *label_value_with_blank, int batch, int SOffSet, int maxtime, + int numclass, const int *sequence_length, int *label_squence_length, + int *cum_labels_length, float *cost, float *grads, float *prob_num, + bool ignore_longer_outputs_than_inputs, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cuh new file mode 100644 index 00000000000..8e21b7e634d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cuh @@ -0,0 +1,59 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CTCLOSS_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CTCLOSS_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void CalculateFwdVar(T *log_alpha_b, int *label_value_with_blank, T *softmax_probs, + const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet, + int maxtime, int blank, int *label_squence_length, int *cum_labels_length, + bool ignore_longer_outputs_than_inputs, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void CalculateBwdVar(T *log_beta_b, int *label_value_with_blank, T *softmax_probs, + const int *sequence_length, bool ctc_merge_repeated, int batch, int SOffSet, + int maxtime, int blank, int *label_squence_length, int *cum_labels_length, + bool ignore_longer_outputs_than_inputs, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void InnerSoftMax(const T *probs, T *softmax_cost, const int *sequence_length, int max_time, int batch, + int numclass, cudaStream_t stream); + +CUDA_LIB_EXPORT void GenLabelValuePCR(int *label_value_sp, int *label_value_pcr, int *label_squence_length, + int *cum_labels_length, int *max_labels_length, int batch, cudaStream_t stream); + +CUDA_LIB_EXPORT void GenLabelWithBlank(int *label_value, int *label_value_with_blank, int *label_squence_length, + int *precum_labels_length, int *cum_labels_length, int batch, int blank, + cudaStream_t stream); + +CUDA_LIB_EXPORT void GenLabelValue(int *label_value_sp, const int64_t *label_indices, const int *label_values, + int *label_squence_length, int *cum_labels_length, int *max_labels_length, + int size, int blank, + int batch, cudaStream_t stream); + +CUDA_LIB_EXPORT void CalculatePreLength(int *label_squence_length, int *precum_labels_length, int *cum_labels_length, + int *max_labels_length, const int64_t *label_indices, int batch, int size, + cudaStream_t stream); +CUDA_LIB_EXPORT void 
CalculateMaxSequence(const int *sequence_length, int *max_labels_length, int batch, + cudaStream_t stream); +template +CUDA_LIB_EXPORT void CTCLoss(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_with_blank, int batch, + int SOffSet, int maxtime, int numclass, const int *sequence_length, + int *label_squence_length, int *cum_labels_length, T *cost, T *grads, T *prob_num, + bool ignore_longer_outputs_than_inputs, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CTCLOSS_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.cc new file mode 100644 index 00000000000..88c40cb6190 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.cc @@ -0,0 +1,40 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "utils/ms_context.h" + +namespace mindspore { +namespace device { +namespace gpu { +CudaCommon &CudaCommon::GetInstance() { + static CudaCommon instance; + return instance; +} + +CudaCommon::CudaCommon() { + uint32_t device_id = MsContext::GetInstance()->get_param(MS_CTX_DEVICE_ID); + cudaDeviceProp prop; + (void)cudaGetDeviceProperties(&prop, device_id); + threads_per_block_ = prop.maxThreadsPerBlock; + max_blocks_ = prop.multiProcessorCount; + major_sm_ = prop.major; + minor_sm_ = prop.minor; + max_share_memory_ = prop.sharedMemPerBlock; +} +} // namespace gpu +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_common.h b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h similarity index 78% rename from mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_common.h rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h index e0a172fdbd1..6e2ba9c96fe 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_common.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h @@ -14,12 +14,15 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_CUDA_COMMON_H_ -#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_CUDA_COMMON_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUDA_COMMON_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUDA_COMMON_H_ +#include +#include #include -#include "plugin/device/gpu/hal/device/gpu_device_manager.h" +#include +#define CUDA_LIB_EXPORT __attribute__((visibility("default"))) #define CUDA_KERNEL_ASSERT(cond) \ if (!(cond)) { \ __assert_fail(#cond, __FILE__, static_cast(__LINE__), __FUNCTION__); \ @@ -40,22 +43,10 @@ class CudaCommon { void set_check_sm(const bool &flag) { check_sm_ = flag; } bool check_sm() const { return check_sm_; } - static CudaCommon &GetInstance() { - static CudaCommon instance; - return instance; - } + static CudaCommon &GetInstance(); private: - CudaCommon() { - uint32_t device_id = GPUDeviceManager::GetInstance().cur_device_id(); - cudaDeviceProp prop; - (void)cudaGetDeviceProperties(&prop, device_id); - threads_per_block_ = prop.maxThreadsPerBlock; - max_blocks_ = prop.multiProcessorCount; - major_sm_ = prop.major; - minor_sm_ = prop.minor; - max_share_memory_ = prop.sharedMemPerBlock; - } + CudaCommon(); ~CudaCommon() = default; CudaCommon(const CudaCommon &) = delete; CudaCommon &operator=(const CudaCommon &) = delete; @@ -80,4 +71,4 @@ class CudaCommon { } // namespace device } // namespace mindspore -#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_CUDA_COMMON_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUDA_COMMON_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cu similarity index 72% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cu index a31fd2234bf..e881ee06266 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cu @@ -15,7 +15,7 @@ */ #include "cumprod_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "include/cuda_fp16.h" template __global__ void Copy(T *input, T *output, size_t size) { @@ -137,19 +137,21 @@ void CumProd(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, return; } -template void CumProd(const uint8_t *input, uint8_t *output, uint8_t *workspace, size_t dim0, size_t dim1, - size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_, - cudaStream_t stream); -template void CumProd(const int8_t *input, int8_t *output, int8_t *workspace, size_t dim0, size_t dim1, - size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_, - cudaStream_t stream); -template void CumProd(const int32_t *input, int32_t *output, int32_t *workspace, size_t dim0, size_t dim1, - size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_, - cudaStream_t stream); -template void CumProd(const double *input, double *output, double *workspace, size_t dim0, size_t dim1, - size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_, - cudaStream_t stream); -template void CumProd(const float *input, float *output, float *workspace, size_t dim0, size_t dim1, size_t dim2, - size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); -template void CumProd(const half *input, half *output, half *workspace, size_t dim0, size_t dim1, size_t dim2, - size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumProd(const uint8_t *input, uint8_t *output, uint8_t *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumProd(const int8_t *input, int8_t 
*output, int8_t *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumProd(const int32_t *input, int32_t *output, int32_t *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumProd(const double *input, double *output, double *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumProd(const float *input, float *output, float *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumProd(const half *input, half *output, half *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cuh index 7998bc08912..8a08d82ebf5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cuh @@ -14,9 +14,10 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMPROD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMPROD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CumProd(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, size_t stride, - size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_ +CUDA_LIB_EXPORT void CumProd(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, + size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMPROD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cu similarity index 72% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cu index 84f148cf6bb..9727ecfd97b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cu @@ -15,7 +15,7 @@ */ #include "cumsum_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "include/cuda_fp16.h" template __global__ void Copy(T *input, T *output, size_t size) { @@ -137,19 +137,21 @@ void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, s return; } -template void CumSum(const uint8_t *input, uint8_t *output, uint8_t *workspace, size_t dim0, size_t dim1, - size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_, - cudaStream_t stream); -template 
void CumSum(const int8_t *input, int8_t *output, int8_t *workspace, size_t dim0, size_t dim1, - size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_, - cudaStream_t stream); -template void CumSum(const int32_t *input, int32_t *output, int32_t *workspace, size_t dim0, size_t dim1, - size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_, - cudaStream_t stream); -template void CumSum(const double *input, double *output, double *workspace, size_t dim0, size_t dim1, - size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_, - cudaStream_t stream); -template void CumSum(const float *input, float *output, float *workspace, size_t dim0, size_t dim1, size_t dim2, - size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); -template void CumSum(const half *input, half *output, half *workspace, size_t dim0, size_t dim1, size_t dim2, - size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumSum(const uint8_t *input, uint8_t *output, uint8_t *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumSum(const int8_t *input, int8_t *output, int8_t *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumSum(const int32_t *input, int32_t *output, int32_t *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumSum(const double *input, double *output, double *workspace, + size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumSum(const float *input, float *output, float *workspace, + 
size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, + bool exclusive_, bool reverse_, cudaStream_t stream); +template CUDA_LIB_EXPORT void CumSum(const half *input, half *output, half *workspace, size_t dim0, size_t dim1, + size_t dim2, size_t stride, size_t stride2, bool exclusive_, bool reverse_, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh index 7e3c40c99ee..6d70ff1cbab 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh @@ -14,9 +14,10 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMSUM_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMSUM_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, size_t stride, - size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_ +CUDA_LIB_EXPORT void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, + size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_CUMSUM_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cu 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cu new file mode 100644 index 00000000000..99b0e7ac923 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cu @@ -0,0 +1,138 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "depthtospace_impl.cuh" +#include "include/cuda_fp16.h" + +template +__global__ void DepthToSpace(const size_t size, const T *input, const size_t in, + const size_t ic, const size_t ih, const size_t iw, + const size_t on, const size_t oc, const size_t oh, + const size_t ow, const size_t r, T *output) { + size_t temp_stride = 0; + size_t temp_pos = 0; + size_t input_pos = 0; + size_t output_pos_array[DEPTHTOSPACE_BUFFER_DIMENSION]; + + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; + pos += blockDim.x * gridDim.x) { + temp_stride = oc * oh * ow; + output_pos_array[0] = pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= oc; + output_pos_array[1] = temp_pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= oh; + output_pos_array[2] = temp_pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= ow; + output_pos_array[3] = temp_pos / temp_stride; + + input_pos += output_pos_array[0]; + input_pos = + (input_pos * ic) + + (output_pos_array[1] + + (r * (output_pos_array[2] % r) + output_pos_array[3] % r) * oc); + input_pos 
= (input_pos * ih) + (output_pos_array[2] / r); + input_pos = (input_pos * iw) + (output_pos_array[3] / r); + + output[pos] = input[input_pos]; + input_pos = 0; + } + return; +} + +template +void CalDepthToSpace(const size_t size, const T *input, const size_t in, + const size_t ic, const size_t ih, const size_t iw, + const size_t on, const size_t oc, const size_t oh, + const size_t ow, const size_t r, T *output, + cudaStream_t cuda_stream) { + DepthToSpace<<>>( + size, input, in, ic, ih, iw, on, oc, oh, ow, r, output); + return; +} + +template CUDA_LIB_EXPORT void CalDepthToSpace(const size_t size, const float *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDepthToSpace(const size_t size, const half *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, half *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDepthToSpace(const size_t size, const int *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, int *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDepthToSpace(const size_t size, const int64_t *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, int64_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDepthToSpace(const size_t size, const int16_t *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, int16_t *output, + cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void CalDepthToSpace(const size_t size, const int8_t *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, int8_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDepthToSpace(const size_t size, const uint8_t *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, uint8_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void +CalDepthToSpace(const size_t size, const uint16_t *input, + const size_t in, const size_t ic, const size_t ih, + const size_t iw, const size_t on, const size_t oc, + const size_t oh, const size_t ow, const size_t r, + uint16_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void +CalDepthToSpace(const size_t size, const uint32_t *input, + const size_t in, const size_t ic, const size_t ih, + const size_t iw, const size_t on, const size_t oc, + const size_t oh, const size_t ow, const size_t r, + uint32_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void +CalDepthToSpace(const size_t size, const uint64_t *input, + const size_t in, const size_t ic, const size_t ih, + const size_t iw, const size_t on, const size_t oc, + const size_t oh, const size_t ow, const size_t r, + uint64_t *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh new file mode 100644 index 00000000000..1978bf96519 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DEPTHTOSPACE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DEPTHTOSPACE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +#define DEPTHTOSPACE_BUFFER_DIMENSION 4 +template +CUDA_LIB_EXPORT void CalDepthToSpace(const size_t size, const T *input, const size_t in, + const size_t ic, const size_t ih, const size_t iw, + const size_t on, const size_t oc, const size_t oh, + const size_t ow, const size_t r, T *output, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DEPTHTOSPACE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cu similarity index 77% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cu index bfe5741c294..c93f1e654c2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cu @@ -15,6 +15,7 @@ */ #include "determinant_triangle_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void DetTriangleKernel(T *input, T *output, size_t matrix_n_, size_t count) { for (size_t i = blockIdx.x * blockDim.x + 
threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { @@ -75,9 +76,11 @@ bool CheckTriangle(T *input, int fill_mode_, size_t matrix_n_, size_t count, cud return host_error_res; } -template void DetTriangle(float *input, float *output, size_t matrix_n_, size_t count, cudaStream_t cuda_stream); -template void DetTriangle(half *input, half *output, size_t matrix_n_, size_t count, cudaStream_t cuda_stream); -template bool CheckTriangle(float *input, int fill_mode_, size_t matrix_n_, size_t count, - cudaStream_t cuda_stream); -template bool CheckTriangle(half *input, int fill_mode_, size_t matrix_n_, size_t count, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void DetTriangle(float *input, float *output, size_t matrix_n_, size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void DetTriangle(half *input, half *output, size_t matrix_n_, size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT bool CheckTriangle(float *input, int fill_mode_, size_t matrix_n_, size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT bool CheckTriangle(half *input, int fill_mode_, size_t matrix_n_, size_t count, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cuh similarity index 53% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cuh index a1a4cc0803b..c992c98fef8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cuh @@ -14,14 +14,13 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DETERMINANT_TRIANGLE_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DETERMINANT_TRIANGLE_IMPL_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DETERMINANT_TRIANGLE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DETERMINANT_TRIANGLE_IMPL_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void DetTriangle(T *input, T *output, size_t matrix_n_, size_t count, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void DetTriangle(T *input, T *output, size_t matrix_n_, size_t count, cudaStream_t cuda_stream); template -bool CheckTriangle(T *input, int fill_mode_, size_t matrix_n_, size_t count, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DETERMINANT_TRIANGLE_IMPL_H_ +CUDA_LIB_EXPORT bool CheckTriangle(T *input, int fill_mode_, size_t matrix_n_, size_t count, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DETERMINANT_TRIANGLE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cu similarity index 58% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cu index 35decf9060b..debe823c39f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cu @@ -17,6 +17,7 @@ #include #include "dropout3d_impl.cuh" #include "include/cuda_runtime.h" +#include "include/cuda_fp16.h" template __global__ void Dropout3DForwardKernel(const T *input, bool *mask, T *output, float *rand_f, const size_t num_count, @@ -58,20 +59,21 @@ void Dropout3DForward(const T *input, bool *mask, T 
*output, float *rand_f, cons keep_prob, scale, num_per_chan); } -template void Dropout3DForward(const float *input, bool *mask, float *output, float *rand_f, - const size_t num_count, const float keep_prob, const size_t num_per_chan, - cudaStream_t cuda_stream); -template void Dropout3DForward(const half *input, bool *mask, half *output, float *rand_f, const size_t num_count, - const float keep_prob, const size_t num_per_chan, cudaStream_t cuda_stream); -template void Dropout3DForward(const int8_t *input, bool *mask, int8_t *output, float *rand_f, - const size_t num_count, const float keep_prob, const size_t num_per_chan, - cudaStream_t cuda_stream); -template void Dropout3DForward(const int16_t *input, bool *mask, int16_t *output, float *rand_f, - const size_t num_count, const float keep_prob, const size_t num_per_chan, - cudaStream_t cuda_stream); -template void Dropout3DForward(const int32_t *input, bool *mask, int32_t *output, float *rand_f, - const size_t num_count, const float keep_prob, const size_t num_per_chan, - cudaStream_t cuda_stream); -template void Dropout3DForward(const int64_t *input, bool *mask, int64_t *output, float *rand_f, - const size_t num_count, const float keep_prob, const size_t num_per_chan, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Dropout3DForward(const float *input, bool *mask, float *output, float *rand_f, + const size_t num_count, const float keep_prob, + const size_t num_per_chan, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Dropout3DForward(const half *input, bool *mask, half *output, float *rand_f, + const size_t num_count, const float keep_prob, + const size_t num_per_chan, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Dropout3DForward(const int8_t *input, bool *mask, int8_t *output, float *rand_f, + const size_t num_count, const float keep_prob, + const size_t num_per_chan, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Dropout3DForward(const int16_t *input, bool *mask, 
int16_t *output, + float *rand_f, const size_t num_count, const float keep_prob, + const size_t num_per_chan, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Dropout3DForward(const int32_t *input, bool *mask, int32_t *output, + float *rand_f, const size_t num_count, const float keep_prob, + const size_t num_per_chan, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Dropout3DForward(const int64_t *input, bool *mask, int64_t *output, + float *rand_f, const size_t num_count, const float keep_prob, + const size_t num_per_chan, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cuh index f1a50ce2ab7..75cd7d0fddd 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cuh @@ -14,13 +14,12 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_CUDA_IMPL_DROPOUT3D_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_CUDA_IMPL_DROPOUT3D_IMPL_CUH_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT3D_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT3D_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void Dropout3DForward(const T *input, bool *mask, T *output, float *rand_f, const size_t num_count, - const float keep_prob, const size_t num_per_chan, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void Dropout3DForward(const T *input, bool *mask, T *output, float *rand_f, const size_t num_count, + const float keep_prob, const size_t num_per_chan, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_CUDA_IMPL_DROPOUT3D_IMPL_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT3D_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cu similarity index 76% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cu index d65f319ab73..e84f9cb8788 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cu @@ -17,6 +17,7 @@ #include #include "dropout_impl.cuh" #include "include/cuda_runtime.h" +#include "include/cuda_fp16.h" template __global__ void DropoutForwardKernel(const T *input, T *mask, T *output, float *mask_f, size_t num_count, float keep_prob) { @@ -65,11 +66,11 @@ void DropoutBackward(const T *dy, const T *mask, T *dx, size_t num_count, float DropoutBackwardKernel<<>>(dy, mask, dx, num_count, 
drop_prob); } -template void DropoutForward(const float *input, float *mask, float *output, float *mask_f, - size_t num_count, float drop_prob, cudaStream_t cuda_stream); -template void DropoutForward(const half *input, half *mask, half *output, float *mask_f, - size_t num_count, float drop_prob, cudaStream_t cuda_stream); -template void DropoutBackward(const float *dy, const float *mask, float *dx, size_t num_count, - float drop_prob, cudaStream_t cuda_stream); -template void DropoutBackward(const half *dy, const half *mask, half *dx, size_t num_count, - float drop_prob, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void DropoutForward(const float *input, float *mask, float *output, float *mask_f, + size_t num_count, float drop_prob, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void DropoutForward(const half *input, half *mask, half *output, float *mask_f, + size_t num_count, float drop_prob, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void DropoutBackward(const float *dy, const float *mask, float *dx, size_t num_count, + float drop_prob, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void DropoutBackward(const half *dy, const half *mask, half *dx, size_t num_count, + float drop_prob, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh new file mode 100644 index 00000000000..1518b2b2b48 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void DropoutForward(const T *input, T *mask, T *output, float *mask_f, size_t num_count, + float keep_prob, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void DropoutBackward(const T *dy, const T *mask, T *dx, size_t num_count, float keep_prob, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DROPOUT_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cu new file mode 100644 index 00000000000..1e37f6b92fd --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cu @@ -0,0 +1,124 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "dynamic_range_impl.cuh" +#include + +template +__global__ void ValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta, + int64_t *output_shape, DynamicRangeErrorCode *error_code, + const int64_t max_output_size) { + T start = range_start[0]; + T end = range_end[0]; + T delta = range_delta[0]; + *error_code = DynamicRangeErrorCode::kOk; + + if (delta == 0) { + *error_code = DynamicRangeErrorCode::kDeltaIsZero; + return; + } + + if (start < end && delta < 0) { + *error_code = DynamicRangeErrorCode::kInvalidNegativeDelta; + return; + } + + if (start > end && delta > 0) { + *error_code = DynamicRangeErrorCode::kInvalidPositiveDelta; + return; + } + + if (*error_code == DynamicRangeErrorCode::kOk) { + int64_t real_output_shape = static_cast(ceil(static_cast(end - start) / delta)); + + // verification in case of precision error during calculation of real_output_shape. one multiplication followed by + // one addition is much more precise than the division that occurs when calculating real_output_shape. 
+ double last_value = start + (delta * (real_output_shape - 1)); + double epsilon = 1e-6; + if ((end > start && last_value > end) || (start > end && last_value < end) || fabsf(last_value - end) < epsilon) { + real_output_shape--; + } + + if (real_output_shape > max_output_size) { + *error_code = DynamicRangeErrorCode::kMaxSizeExceeded; + } + *output_shape = real_output_shape; + } +} + +template +__global__ void Range(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape, + const int64_t max_output_size) { + T start = range_start[0]; + T delta = range_delta[0]; + + size_t gt_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; gt_id < *output_shape; gt_id += blockDim.x * gridDim.x) { + output[gt_id] = gt_id * delta + start; + } +} + +template +void CudaValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta, + int64_t *output_shape, DynamicRangeErrorCode *error_code, + const int64_t max_output_size, cudaStream_t cuda_stream) { + ValidateInputAndInferShape<<<1, 1, 0, cuda_stream>>>(range_start, range_end, range_delta, output_shape, error_code, + max_output_size); +} + +template +void CalRange(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape, + DynamicRangeErrorCode *error_code, const int64_t max_output_size, cudaStream_t cuda_stream) { + Range<<>>(range_start, range_end, range_delta, + output, output_shape, max_output_size); +} + +template CUDA_LIB_EXPORT void CudaValidateInputAndInferShape(const int *range_start, const int *range_end, + const int *range_delta, int64_t *output_shape, + DynamicRangeErrorCode *error_code, + const int64_t max_output_size, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CudaValidateInputAndInferShape(const int64_t *range_start, + const int64_t *range_end, + const int64_t *range_delta, int64_t *output_shape, + DynamicRangeErrorCode *error_code, + const int64_t max_output_size, + cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void CudaValidateInputAndInferShape(const float *range_start, const float *range_end, + const float *range_delta, int64_t *output_shape, + DynamicRangeErrorCode *error_code, + const int64_t max_output_size, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CudaValidateInputAndInferShape(const double *range_start, const double *range_end, + const double *range_delta, int64_t *output_shape, + DynamicRangeErrorCode *error_code, + const int64_t max_output_size, + cudaStream_t cuda_stream); + +template CUDA_LIB_EXPORT void CalRange(const int *range_start, const int *range_end, const int *range_delta, + int *output, int64_t *output_shape, DynamicRangeErrorCode *error_code, + const int64_t max_output_size, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalRange(const int64_t *range_start, const int64_t *range_end, + const int64_t *range_delta, int64_t *output, int64_t *output_shape, + DynamicRangeErrorCode *error_code, const int64_t max_output_size, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalRange(const float *range_start, const float *range_end, + const float *range_delta, float *output, int64_t *output_shape, + DynamicRangeErrorCode *error_code, const int64_t max_output_size, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalRange(const double *range_start, const double *range_end, + const double *range_delta, double *output, int64_t *output_shape, + DynamicRangeErrorCode *error_code, const int64_t max_output_size, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh new file mode 100644 index 00000000000..f606e5d351e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh @@ -0,0 +1,40 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the 
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_RANGE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_RANGE_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +enum class DynamicRangeErrorCode { + kOk = 0, + kDeltaIsZero, + kInvalidPositiveDelta, + kInvalidNegativeDelta, + kMaxSizeExceeded +}; + +template +CUDA_LIB_EXPORT void CudaValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta, + int64_t *output_shape, DynamicRangeErrorCode *error_code, + const int64_t max_output_size, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalRange(const T *range_start, const T *range_end, const T *range_delta, T *output, + int64_t *output_shape, DynamicRangeErrorCode *error_code, const int64_t max_output_size, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_RANGE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cu similarity index 96% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cu index 0586500f103..08d166da3d2 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cu @@ -14,7 +14,6 @@ * limitations under the License. */ #include "dynamic_stitch_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" __global__ void StitchKernel(const int *index_addr, const unsigned char *data_addr, unsigned char *output_addr, const size_t index_num, const size_t data_size, int *max_index_dev) { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cuh similarity index 50% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cuh index bab5c8188a0..39fbf6cc5a0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cuh @@ -14,11 +14,12 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_STITCH_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_STITCH_CUH_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_STITCH_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_STITCH_IMPL_CUH_ #include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" -void CallStitch(const int *index_addr, const unsigned char *data_addr, unsigned char *output_addr, - const size_t index_num, const size_t data_size, int *max_index_dev, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_STITCH_CUH_ +CUDA_LIB_EXPORT void CallStitch(const int *index_addr, const unsigned char *data_addr, unsigned char *output_addr, + const size_t index_num, const size_t data_size, int *max_index_dev, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_DYNAMIC_STITCH_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cu similarity index 73% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cu index 3a8abf65280..26a35f0f7cf 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cu @@ -15,8 +15,7 @@ */ #include #include "einsum_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void Diagonal(const size_t out_size, const T *input, const size_t *inp_shape, const size_t shape_size, const size_t left_dim, const size_t right_dim, T *output) { @@ -64,15 +63,15 @@ 
void CalDiagonal(const size_t size, const T *input, const size_t *input_shape, c Diagonal<<>>(size, input, input_shape, shape_size, left_dim, right_dim, output); } -template void CalDiagonal(const size_t size, const double *input, const size_t *input_shape, - const size_t shape_size, const size_t left_dim, const size_t right_dim, - double *output, cudaStream_t cuda_stream); -template void CalDiagonal(const size_t size, const float *input, const size_t *input_shape, - const size_t shape_size, const size_t left_dim, const size_t right_dim, float *output, - cudaStream_t cuda_stream); -template void CalDiagonal(const size_t size, const half *input, const size_t *input_shape, - const size_t shape_size, const size_t left_dim, const size_t right_dim, half *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDiagonal(const size_t size, const double *input, const size_t *input_shape, + const size_t shape_size, const size_t left_dim, + const size_t right_dim, double *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDiagonal(const size_t size, const float *input, const size_t *input_shape, + const size_t shape_size, const size_t left_dim, const size_t right_dim, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDiagonal(const size_t size, const half *input, const size_t *input_shape, + const size_t shape_size, const size_t left_dim, const size_t right_dim, + half *output, cudaStream_t cuda_stream); template __global__ void DiagonalGrad(const size_t d_size, const T *dout, const size_t *inp_shape, const size_t shape_size, const size_t left_dim, const size_t right_dim, T *d_inp) { @@ -116,15 +115,15 @@ void CalDiagonalGrad(const size_t d_size, const T *dout, const size_t *input_sha DiagonalGrad<<>>(d_size, dout, input_shape, shape_size, left_dim, right_dim, d_inp); } -template void CalDiagonalGrad(const size_t size, const double *dout, const size_t *input_shape, - const size_t shape_size, const size_t left_dim, 
const size_t right_dim, - double *d_inp, cudaStream_t cuda_stream); -template void CalDiagonalGrad(const size_t size, const float *dout, const size_t *input_shape, - const size_t shape_size, const size_t left_dim, const size_t right_dim, - float *d_inp, cudaStream_t cuda_stream); -template void CalDiagonalGrad(const size_t size, const half *dout, const size_t *input_shape, - const size_t shape_size, const size_t left_dim, const size_t right_dim, half *d_inp, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDiagonalGrad(const size_t size, const double *dout, const size_t *input_shape, + const size_t shape_size, const size_t left_dim, + const size_t right_dim, double *d_inp, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDiagonalGrad(const size_t size, const float *dout, const size_t *input_shape, + const size_t shape_size, const size_t left_dim, + const size_t right_dim, float *d_inp, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDiagonalGrad(const size_t size, const half *dout, const size_t *input_shape, + const size_t shape_size, const size_t left_dim, + const size_t right_dim, half *d_inp, cudaStream_t cuda_stream); template __global__ void ReduceSum(const size_t out_size, const T *input, T *output, const size_t *out_shape, const size_t shape_size, const size_t reduce_dim, const size_t dim_val) { @@ -273,12 +272,12 @@ void CalDot(const size_t size, T *input_a, const T *input_b, T *output, cudaStre cudaFree(cur_out); } -template void CalDot(const size_t size, double *input_a, const double *input_b, double *output, - cudaStream_t cuda_stream); -template void CalDot(const size_t size, float *input_a, const float *input_b, float *output, - cudaStream_t cuda_stream); -template void CalDot(const size_t size, half *input_a, const half *input_b, half *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDot(const size_t size, double *input_a, const double *input_b, double *output, + cudaStream_t cuda_stream); 
+template CUDA_LIB_EXPORT void CalDot(const size_t size, float *input_a, const float *input_b, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDot(const size_t size, half *input_a, const half *input_b, half *output, + cudaStream_t cuda_stream); template __global__ void DotGrad(const size_t size, const T dout, T *mid_res, T *input_b, T *input_a) { @@ -291,12 +290,12 @@ template void CalDotGrad(const size_t size, const T dout, T *mid_res, T *input_b, T *input_a, cudaStream_t cuda_stream) { DotGrad<<>>(size, dout, mid_res, input_b, input_a); } -template void CalDotGrad(const size_t size, const double dout, double *mid_res, double *input_b, - double *input_a, cudaStream_t cuda_stream); -template void CalDotGrad(const size_t size, const float dout, float *mid_res, float *input_b, float *input_a, - cudaStream_t cuda_stream); -template void CalDotGrad(const size_t size, const half dout, half *mid_res, half *input_b, half *input_a, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDotGrad(const size_t size, const double dout, double *mid_res, double *input_b, + double *input_a, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDotGrad(const size_t size, const float dout, float *mid_res, float *input_b, + float *input_a, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalDotGrad(const size_t size, const half dout, half *mid_res, half *input_b, + half *input_a, cudaStream_t cuda_stream); // Element-wise ArithMetic template __global__ void ElewiseArithMulKernel(const size_t nums, const T *x0, const T *x1, T *y) { @@ -343,14 +342,15 @@ void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft } } -template void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape, - const size_t lft_num, const size_t *rht_shape, const size_t rht_num, - const size_t *out_shape, const size_t out_num, const double *x0, const double *x1, - double *y, cudaStream_t stream); -template 
void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape, - const size_t lft_num, const size_t *rht_shape, const size_t rht_num, - const size_t *out_shape, const size_t out_num, const float *x0, const float *x1, float *y, - cudaStream_t stream); -template void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape, - const size_t lft_num, const size_t *rht_shape, const size_t rht_num, const size_t *out_shape, - const size_t out_num, const half *x0, const half *x1, half *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape, + const size_t lft_num, const size_t *rht_shape, const size_t rht_num, + const size_t *out_shape, const size_t out_num, const double *x0, + const double *x1, double *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape, + const size_t lft_num, const size_t *rht_shape, const size_t rht_num, + const size_t *out_shape, const size_t out_num, const float *x0, + const float *x1, float *y, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape, + const size_t lft_num, const size_t *rht_shape, const size_t rht_num, + const size_t *out_shape, const size_t out_num, const half *x0, + const half *x1, half *y, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cuh new file mode 100644 index 00000000000..f2073c8603d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cuh @@ -0,0 +1,62 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EINSUM_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EINSUM_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "include/cuda_fp16.h" +#define EINSUM_MAX_DIMENSION 20 +template +struct DynamicSharedMem; +template <> +struct DynamicSharedMem { + __device__ double *addr() { + extern __shared__ double addr_double[]; + return addr_double; + } +}; +template <> +struct DynamicSharedMem { + __device__ float *addr() { + extern __shared__ float addr_float[]; + return addr_float; + } +}; +template <> +struct DynamicSharedMem { + __device__ half *addr() { + extern __shared__ half addr_half[]; + return addr_half; + } +}; +template +CUDA_LIB_EXPORT void CalDiagonal(const size_t size, const T *input, const size_t *input_shape, const size_t shape_size, + const size_t left_dim, const size_t right_dim, T *output, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalDiagonalGrad(const size_t d_size, const T *dout, const size_t *input_shape, + const size_t shape_size, const size_t left_dim, const size_t right_dim, T *d_inp, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalDot(const size_t size, T *input_a, const T *input_b, T *output, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalDotGrad(const size_t size, const T dout, T *mid_res, T *input_b, T *input_a, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalMul(const bool broadcast_flag, const size_t shape_len, const 
size_t *lft_shape, + const size_t lft_num, const size_t *rht_shape, const size_t rht_num, + const size_t *out_shape, const size_t out_num, const T *x0, const T *x1, T *y, + cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EINSUM_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cu new file mode 100644 index 00000000000..910469623c8 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cu @@ -0,0 +1,100 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh" +#include "include/cuda_fp16.h" + +template +__global__ void SubOffset(T *indices, size_t size, int64_t offset) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + indices[pos] -= static_cast(offset); + } + return; +} + +template +void CalEmbeddingLookup(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, int64_t offset, cudaStream_t stream) { + size_t size = output_dim0 * output_dim1 * output_dim2; + SubOffset<<>>(indices, output_dim1, offset); + GatherV2Kernel<<>>(input, indices, output, output_dim0, output_dim1, + output_dim2, input_dim1); + // restore indices + SubOffset<<>>(indices, output_dim1, -offset); + return; +} + +template CUDA_LIB_EXPORT void CalEmbeddingLookup(float *input, int *indices, float *output, + size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, int64_t offset, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(float *input, int64_t *indices, float *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(half *input, int *indices, half *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + int64_t offset, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(half *input, int64_t *indices, half *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(double *input, int *indices, double *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); 
+template CUDA_LIB_EXPORT void CalEmbeddingLookup(double *input, int64_t *indices, double *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(int *input, int *indices, int *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + int64_t offset, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(int *input, int64_t *indices, int *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(int16_t *input, int *indices, int16_t *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(int16_t *input, int64_t *indices, int16_t *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, + int64_t offset, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(int8_t *input, int *indices, int8_t *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(int8_t *input, int64_t *indices, int8_t *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(uint8_t *input, int *indices, uint8_t *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(uint8_t *input, int64_t *indices, uint8_t *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, + int64_t offset, cudaStream_t stream); +template 
CUDA_LIB_EXPORT void CalEmbeddingLookup(bool *input, int *indices, bool *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + int64_t offset, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalEmbeddingLookup(bool *input, int64_t *indices, bool *output, + size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh index b4f220171d0..26f625c08b9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh @@ -14,11 +14,12 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EMBEDDING_LOOKUP_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EMBEDDING_LOOKUP_IMPL_CUH_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EMBEDDING_LOOKUP_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EMBEDDING_LOOKUP_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalEmbeddingLookup(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2, - size_t input_dim1, int64_t offset, cudaStream_t stream); +CUDA_LIB_EXPORT void CalEmbeddingLookup(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, int64_t offset, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EMBEDDING_LOOKUP_IMPL_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EMBEDDING_LOOKUP_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cu similarity index 62% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cu index 080ac397ba7..29df14fd9c3 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cu @@ -15,7 +15,7 @@ */ #include "equalcount_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "include/cuda_fp16.h" template __global__ void EqualCount(const int size, const T* input1, const T* input2, T* output) { T equal_count = 0; @@ -35,9 +35,9 @@ void CalEqualCount(const int size, const T* input1, const T* input2, T* output, return; } -template void CalEqualCount(const int size, const int* input1, 
const int* input2, int* output, - cudaStream_t cuda_stream); -template void CalEqualCount(const int size, const float* input1, const float* input2, float* output, - cudaStream_t cuda_stream); -template void CalEqualCount(const int size, const half* input1, const half* input2, half* output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalEqualCount(const int size, const int* input1, const int* input2, int* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalEqualCount(const int size, const float* input1, const float* input2, + float* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalEqualCount(const int size, const half* input1, const half* input2, half* output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cuh similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cuh index ba6004da3b8..a9b2dfa38fe 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cuh @@ -14,9 +14,11 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EQUALCOUNT_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EQUALCOUNT_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EQUALCOUNT_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EQUALCOUNT_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalEqualCount(const int size, const T* input1, const T* input2, T* output, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalEqualCount(const int size, const T* input1, const T* input2, T* output, + cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EQUALCOUNT_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EQUALCOUNT_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cu new file mode 100644 index 00000000000..210a9adaf27 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cu @@ -0,0 +1,110 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh" +#include "include/cuda_fp16.h" + +template +__global__ void ExtractImagePatches(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row, + int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride, + int64_t patch_stride, int64_t other_stride, int64_t input_row_size, + int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left, + int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride, + int64_t output_depth, const T *input, T *output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_size; pos += blockDim.x * gridDim.x) { + const int64_t batch_index = need_batch ? (static_cast(pos) / other_stride) : 0; + const int64_t inner_index = + need_batch ? (static_cast(pos) - batch_index * other_stride) : static_cast(pos); + // inner index + const int64_t patch_index = inner_index / patch_stride; + const int64_t patch_offset = (inner_index - patch_index * patch_stride) / output_depth; + // row + const int64_t row_index = patch_index / output_cols; + const int64_t row_offset = patch_offset / row_stride; + const int64_t input_row = row_index * stride_row + row_offset * rate_row - row_padding_top; + if (input_row < 0 || input_row >= input_row_size) { + output[pos] = static_cast(0); + continue; + } + // col + const int64_t col_index = patch_index - row_index * output_cols; + const int64_t col_offset = patch_offset - row_offset * row_stride; + const int64_t input_col = col_index * stride_col + col_offset * rate_col - col_padding_left; + if (input_col < 0 || input_col >= input_col_size) { + output[pos] = static_cast(0); + continue; + } + // depth + const int64_t depth = inner_index - (inner_index / output_depth) * output_depth; + // input index + const int64_t input_index = + depth + input_col * col_input_stride + input_row * row_input_stride + batch_index * patch_input_stride; + 
output[pos] = input[static_cast(input_index)]; + } + return; +} + +template +void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row, + int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride, + int64_t patch_stride, int64_t other_stride, int64_t input_row_size, + int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left, + int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride, + int64_t output_depth, const T *input, T *output, cudaStream_t stream) { + ExtractImagePatches<<>>( + output_size, stride_row, stride_col, rate_row, rate_col, output_cols, need_batch, row_stride, patch_stride, + other_stride, input_row_size, input_col_size, row_padding_top, col_padding_left, col_input_stride, row_input_stride, + patch_input_stride, output_depth, input, output); +} + +template CUDA_LIB_EXPORT void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, + int64_t stride_col, int64_t rate_row, int64_t rate_col, + int64_t output_cols, bool need_batch, int64_t row_stride, + int64_t patch_stride, int64_t other_stride, + int64_t input_row_size, int64_t input_col_size, + int64_t row_padding_top, int64_t col_padding_left, + int64_t col_input_stride, int64_t row_input_stride, + int64_t patch_input_stride, int64_t output_depth, + const int *input, int *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, + int64_t stride_col, int64_t rate_row, int64_t rate_col, + int64_t output_cols, bool need_batch, + int64_t row_stride, int64_t patch_stride, + int64_t other_stride, int64_t input_row_size, + int64_t input_col_size, int64_t row_padding_top, + int64_t col_padding_left, int64_t col_input_stride, + int64_t row_input_stride, int64_t patch_input_stride, + int64_t output_depth, const float *input, float *output, + cudaStream_t stream); +template CUDA_LIB_EXPORT void 
CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, + int64_t stride_col, int64_t rate_row, int64_t rate_col, + int64_t output_cols, bool need_batch, int64_t row_stride, + int64_t patch_stride, int64_t other_stride, + int64_t input_row_size, int64_t input_col_size, + int64_t row_padding_top, int64_t col_padding_left, + int64_t col_input_stride, int64_t row_input_stride, + int64_t patch_input_stride, int64_t output_depth, + const half *input, half *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, + int64_t stride_col, int64_t rate_row, int64_t rate_col, + int64_t output_cols, bool need_batch, + int64_t row_stride, int64_t patch_stride, + int64_t other_stride, int64_t input_row_size, + int64_t input_col_size, int64_t row_padding_top, + int64_t col_padding_left, int64_t col_input_stride, + int64_t row_input_stride, int64_t patch_input_stride, + int64_t output_depth, const double *input, + double *output, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh new file mode 100644 index 00000000000..9328c150f11 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh @@ -0,0 +1,32 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EXTRACT_IMAGE_PATCHES_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EXTRACT_IMAGE_PATCHES_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, + int64_t rate_row, int64_t rate_col, int64_t output_cols, + bool need_batch, int64_t row_stride, int64_t patch_stride, + int64_t other_stride, int64_t input_row_size, int64_t input_col_size, + int64_t row_padding_top, int64_t col_padding_left, + int64_t col_input_stride, int64_t row_input_stride, + int64_t patch_input_stride, int64_t output_depth, const T *input, + T *output, cudaStream_t stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EXTRACT_IMAGE_PATCHES_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cu similarity index 80% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cu index 715c135ec43..98279706ad7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cu @@ -36,5 +36,7 @@ void Eye(const size_t size, const size_t dim, T *output_addr, cudaStream_t cuda_ return; } -template void Eye(const size_t size, const size_t dim, float *output_addr, cudaStream_t cuda_stream); -template void Eye(const size_t size, const size_t dim, double *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Eye(const size_t size, const size_t dim, float *output_addr, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT 
void Eye(const size_t size, const size_t dim, double *output_addr, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh similarity index 60% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh index bca33c388ac..0595131283b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh @@ -14,11 +14,10 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EYE_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EYE_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EYE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EYE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void Eye(const size_t size, const size_t dim, T *output_addr, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void Eye(const size_t size, const size_t dim, T *output_addr, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_EYE_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_EYE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cu similarity index 96% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cu index 621f9a46668..e07de17c9d9 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cu @@ -21,7 +21,7 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void FakeLearnedScaleQuantPerChannel(float *output, const int size, float *input_alpha, float *input_quant, const int channel_num) { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh new file mode 100644 index 00000000000..f8f7817ede0 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh @@ -0,0 +1,34 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +CUDA_LIB_EXPORT void CalLSQNudgePerChannel(const float *input, const int size, float *input_alpha, + float *input_quant_max, float *input_div_alpha, float *input_quant, + const bool neg_trunc, const int channel_num, cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalFakeLearnedScaleQuantPerChannel(float *output, const int size, float *input_alpha, + float *input_quant, const int channel_num, + cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalFakeLearnedScaleQuantPerChannelGrad(float *grad_input, float *grad_alpha, const float *gradient, + const int size, const float *input_div_alpha, + const float *input_quant, const bool neg_trunc, + const int channel_num, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cu similarity index 96% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cu index a2fc40e3d65..0103794e3ce 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cu @@ -19,7 +19,7 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ 
void FakeLearnedScaleQuantPerLayer(float *output, const int size, float *input_alpha, float *input_quant) { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh new file mode 100644 index 00000000000..b9a7067f5f3 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh @@ -0,0 +1,33 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERLAYER_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERLAYER_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +CUDA_LIB_EXPORT void CalLSQNudgePerLayer(const float *input, const int size, float *input_alpha, float *input_quant_max, + float *input_div_alpha, float *input_quant, const bool neg_trunc, + cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalFakeLearnedScaleQuantPerLayer(float *output, const int size, float *input_alpha, + float *input_quant, cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalFakeLearnedScaleQuantPerLayerGrad(float *grad_input, float *grad_alpha, const float *gradient, + const int size, const float *input_div_alpha, + const float *input_quant, const bool neg_trunc, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_LEARNED_SCALE_QUANT_PERLAYER_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cu similarity index 100% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cu diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh new file mode 100644 index 00000000000..c9a9c561814 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh @@ -0,0 +1,33 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERCHANNEL_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERCHANNEL_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +CUDA_LIB_EXPORT void CalNudgePerChannel(float *input_min, float *input_max, const float quant_min, + const float quant_max, float *nudge_min, float *nudge_max, float *scale, + const int channel_num, const bool symmetric, cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalFakeQuantPerChannel(const float *input, float *output, const int total_num, + const int channel_num, const float *nudge_min, const float *nudge_max, + const float *scale, cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalFakeQuantPerChannelGrad(const float *input, const float *gradient, float *output, + const int total_num, const int channel_num, const float *nudge_min, + const float *nudge_max, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERCHANNEL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cu similarity index 100% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cu diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh new file mode 100644 index 00000000000..203190d9aef --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh @@ -0,0 +1,32 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERLAYER_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERLAYER_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +CUDA_LIB_EXPORT void CalNudgePerLayer(float *input_min, float *input_max, const float quant_min, const float quant_max, + float *nudge_min, float *nudge_max, float *scale, const bool symmetric, + cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalFakeQuantPerLayer(const float *input, float *output, const int size, const float *nudge_min, + const float *nudge_max, const float *scale, cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalFakeQuantPerLayerGrad(const float *input, const float *gradient, float *output, const int size, + const float *nudge_min, const float *nudge_max, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FAKE_QUANT_PERLAYER_IMPL_CUH_ diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cu similarity index 64% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cu index 58995dee4cb..d493d338e6f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cu @@ -14,8 +14,9 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh" #include "include/cuda_runtime.h" +#include "include/cuda_fp16.h" template __global__ void FillKernel(const size_t m, const size_t n, const T *input, T *output) { @@ -30,6 +31,9 @@ void Fill(const size_t &m, const size_t &n, const T *input, T *output, cudaStrea FillKernel<<<(m * n + 255) / 256, 256, 0, stream>>>(m, n, input, output); } -template void Fill(const size_t &m, const size_t &n, const float *input, float *output, cudaStream_t stream); -template void Fill(const size_t &m, const size_t &n, const half *input, half *output, cudaStream_t stream); -template void Fill(const size_t &m, const size_t &n, const double *input, double *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void Fill(const size_t &m, const size_t &n, const float *input, float *output, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Fill(const size_t &m, const size_t &n, const half *input, half *output, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Fill(const size_t &m, const size_t &n, const double *input, double *output, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh similarity index 59% rename from 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh index acdcc191d58..0d304724857 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh @@ -14,9 +14,9 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FILL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FILL_H_ -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FILL_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FILL_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void Fill(const size_t &m, const size_t &n, const T *input, T *output, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FILL_H_ +CUDA_LIB_EXPORT void Fill(const size_t &m, const size_t &n, const T *input, T *output, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FILL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cu similarity index 64% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cu index a27fe35e190..07d181cb1da 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cu @@ -15,7 +15,8 @@ */ #include "include/cuda_runtime.h" -#include "plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void IsNan(const size_t size, 
const T* input, bool* out) { @@ -126,15 +127,27 @@ void CalIsFinite(const size_t size, const T* input, bool* output, cudaStream_t c return; } -template void CalFloatStatus(const size_t size, const float* input, float* output, cudaStream_t cuda_stream); -template void CalFloatStatus(const size_t size, const half* input, float* output, cudaStream_t cuda_stream); -template void CalFloatStatus(const size_t size, const double* input, float* output, cudaStream_t cuda_stream); -template void CalIsInf(const size_t size, const float* input, bool* output, cudaStream_t cuda_stream); -template void CalIsInf(const size_t size, const half* input, bool* output, cudaStream_t cuda_stream); -template void CalIsInf(const size_t size, const double* input, bool* output, cudaStream_t cuda_stream); -template void CalIsNan(const size_t size, const float* input, bool* output, cudaStream_t cuda_stream); -template void CalIsNan(const size_t size, const half* input, bool* output, cudaStream_t cuda_stream); -template void CalIsNan(const size_t size, const double* input, bool* output, cudaStream_t cuda_stream); -template void CalIsFinite(const size_t size, const float* input, bool* output, cudaStream_t cuda_stream); -template void CalIsFinite(const size_t size, const half* input, bool* output, cudaStream_t cuda_stream); -template void CalIsFinite(const size_t size, const double* input, bool* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalFloatStatus(const size_t size, const float* input, float* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalFloatStatus(const size_t size, const half* input, float* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalFloatStatus(const size_t size, const double* input, float* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIsInf(const size_t size, const float* input, bool* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIsInf(const size_t size, const half* 
input, bool* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIsInf(const size_t size, const double* input, bool* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIsNan(const size_t size, const float* input, bool* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIsNan(const size_t size, const half* input, bool* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIsNan(const size_t size, const double* input, bool* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIsFinite(const size_t size, const float* input, bool* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIsFinite(const size_t size, const half* input, bool* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIsFinite(const size_t size, const double* input, bool* output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh new file mode 100644 index 00000000000..b1794e4d023 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh @@ -0,0 +1,28 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FLOAT_STATUS_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FLOAT_STATUS_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalFloatStatus(const size_t size, const T *input, float *output, cudaStream_t stream); +template +CUDA_LIB_EXPORT void CalIsNan(const size_t size, const T *input, bool *output, cudaStream_t stream); +template +CUDA_LIB_EXPORT void CalIsInf(const size_t size, const T *input, bool *output, cudaStream_t stream); +template +CUDA_LIB_EXPORT void CalIsFinite(const size_t size, const T *input, bool *output, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FLOAT_STATUS_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cu similarity index 76% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cu index dada21003b7..ca586df5afa 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh" +#include "include/cuda_fp16.h" template __device__ __forceinline__ T PowFunc(T x, T y) { @@ -77,11 +78,11 @@ void ApplyFtrl(const size_t size, const T *gradient, const T *learning_rate, con accumulation, linear); } -template void ApplyFtrl(const size_t size, const float *gradient, const float *learning_rate, - const float *l1_regularization, const float *l2_regularization, - const float *learning_rate_power, float *variable, float *accumulation, float *linear, - cudaStream_t cuda_stream); -template void ApplyFtrl(const size_t size, const half *gradient, const half *learning_rate, - const half *l1_regularization, const half *l2_regularization, - const half *learning_rate_power, half *variable, half *accumulation, half *linear, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyFtrl(const size_t size, const float *gradient, const float *learning_rate, + const float *l1_regularization, const float *l2_regularization, + const float *learning_rate_power, float *variable, float *accumulation, + float *linear, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ApplyFtrl(const size_t size, const half *gradient, const half *learning_rate, + const half *l1_regularization, const half *l2_regularization, + const half *learning_rate_power, half *variable, half *accumulation, + half *linear, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh similarity index 50% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh index 87add534c2c..0c9d261288b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh @@ -14,13 +14,12 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FTRL_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FTRL_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FTRL_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FTRL_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void ApplyFtrl(const size_t size, const T *gradient, const T *learning_rate, const T *l1_regularization, - const T *l2_regularization, const T *learning_rate_power, T *variable, T *accumulation, T *linear, - cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void ApplyFtrl(const size_t size, const T *gradient, const T *learning_rate, const T *l1_regularization, + const T *l2_regularization, const T *learning_rate_power, T *variable, T *accumulation, + T *linear, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FTRL_IMPL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_FTRL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cu new file mode 100755 index 00000000000..63b72443404 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cu @@ -0,0 +1,150 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh" +#include "include/cuda_fp16.h" +template +__global__ void GatherKernel(const T *input, const S *index, T *output, const size_t dim_before_axis, + const size_t dim_at_axis_input, const size_t dim_at_axis_output, + const size_t dim_after_axis) { + size_t num = dim_before_axis * dim_at_axis_output * dim_after_axis; + size_t i, k; + for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < num; + id += blockDim.x * gridDim.x) { + i = id / (dim_at_axis_output * dim_after_axis); + k = id % dim_after_axis; + + S j = index[id]; + if (j < 0) { + j += static_cast(dim_at_axis_input); + } + CUDA_KERNEL_ASSERT(j >= 0); + size_t j_read = static_cast(j); + CUDA_KERNEL_ASSERT(j_read < dim_at_axis_input); + size_t read_id = i * dim_at_axis_input * dim_after_axis + j_read * dim_after_axis + k; + output[id] = input[read_id]; + } + return; +} +template +void Gather(const T *input, const S *index, T *output, const size_t dim_before_axis, + const size_t dim_at_axis_input, const size_t dim_at_axis_output, + const size_t dim_after_axis, cudaStream_t stream) { + size_t size = dim_before_axis * dim_at_axis_output * dim_after_axis; + GatherKernel<<>>(input, index, output, dim_before_axis, dim_at_axis_input, + dim_at_axis_output, dim_after_axis); + return; +} + +template CUDA_LIB_EXPORT void Gather(const double *input, const int *index, double *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + 
cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const double *input, const int64_t *index, double *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const float *input, const int *index, float *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const float *input, const int64_t *index, float *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const half *input, const int *index, half *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const half *input, const int64_t *index, half *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const int64_t *input, const int *index, int64_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const int64_t *input, const int64_t *index, int64_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const int *input, const int *index, int *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template 
CUDA_LIB_EXPORT void Gather(const int *input, const int64_t *index, int *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const int16_t *input, const int *index, int16_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const int16_t *input, const int64_t *index, int16_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const int8_t *input, const int *index, int8_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const int8_t *input, const int64_t *index, int8_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const unsigned char *input, const int *index, + unsigned char *output, const size_t dim_before_axis, + const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const unsigned char *input, const int64_t *index, + unsigned char *output, const size_t dim_before_axis, + const size_t dim_at_axis_input, + const size_t dim_at_axis_output, + const size_t dim_after_axis, cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const bool *input, const int *index, bool *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); 
+template CUDA_LIB_EXPORT void Gather(const bool *input, const int64_t *index, bool *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const uint16_t *input, const int *index, uint16_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const uint16_t *input, const int64_t *index, uint16_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const uint32_t *input, const int *index, uint32_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const uint32_t *input, const int64_t *index, uint32_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const uint64_t *input, const int *index, uint64_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void Gather(const uint64_t *input, const int64_t *index, uint64_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_input, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cuh 
rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh index b6749798553..038841bd0af 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh @@ -14,11 +14,12 @@ * limitations under the License. */ -#ifndef MINDSPORE_GATHER_GPU_CU_H -#define MINDSPORE_GATHER_GPU_CU_H +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void Gather(const T *input, const S *index, T *output, const size_t dim_before_axis, - const size_t dim_at_axis_input, const size_t dim_at_axis_output, - const size_t dim_after_axis, cudaStream_t stream); +CUDA_LIB_EXPORT void Gather(const T *input, const S *index, T *output, const size_t dim_before_axis, + const size_t dim_at_axis_input, const size_t dim_at_axis_output, + const size_t dim_after_axis, cudaStream_t stream); -#endif +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cu new file mode 100755 index 00000000000..4e09bd1d8a6 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cu @@ -0,0 +1,154 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" + +template +__global__ void GatherGradKernel(const size_t num, const T *index, const S *grad, S *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis) { + size_t i, k; + + for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < num; + id += blockDim.x * gridDim.x) { + i = id / (dim_at_axis_index * dim_after_axis); + k = id % dim_after_axis; + + T j = index[id]; + if (j < 0) { + j += static_cast(dim_at_axis_output); + } + CUDA_KERNEL_ASSERT(j >= 0); + size_t j_read = static_cast(j); + CUDA_KERNEL_ASSERT(j_read < dim_at_axis_output); + size_t read_id = i * dim_at_axis_output * dim_after_axis + j_read * dim_after_axis + k; + MsAtomicAdd(output + read_id, grad[id]); + } + return; +} + +template +__global__ void InitOutput(const size_t size, S *output) { + S zero = 0; + for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < size; id += blockDim.x * gridDim.x) { + output[id] = zero; + } + return; +} + +template +void GatherGrad(const T *index, const S *grad, S *output, const size_t dim_before_axis, + const size_t dim_at_axis_index, const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream) { + size_t size = dim_before_axis * dim_at_axis_output * dim_after_axis; + InitOutput<<>>(size, output); + + size = dim_before_axis * dim_at_axis_index * dim_after_axis; + GatherGradKernel<<>>(size, index, grad, output, + dim_before_axis, dim_at_axis_index, + dim_at_axis_output, dim_after_axis); + return; +} + +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const double *grad, double *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const 
size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const double *grad, double *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const float *grad, float *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const float *grad, float *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const half *grad, half *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const half *grad, half *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const int *grad, int *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const int *grad, int *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const int8_t *grad, int8_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, 
+ cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const int8_t *grad, int8_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const int16_t *grad, int16_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const int16_t *grad, int16_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const int64_t *grad, int64_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const int64_t *grad, int64_t *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const unsigned char *grad, + unsigned char *output, const size_t dim_before_axis, + const size_t dim_at_axis_index, + const size_t dim_at_axis_output, + const size_t dim_after_axis, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const unsigned char *grad, + unsigned char *output, const size_t dim_before_axis, + const size_t dim_at_axis_index, + const size_t dim_at_axis_output, + const size_t dim_after_axis, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const unsigned int *grad, + unsigned int *output, const size_t dim_before_axis, + const size_t dim_at_axis_index, + const 
size_t dim_at_axis_output, + const size_t dim_after_axis, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const unsigned int *grad, + unsigned int *output, const size_t dim_before_axis, + const size_t dim_at_axis_index, + const size_t dim_at_axis_output, + const size_t dim_after_axis, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int *index, const bool *grad, bool *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherGrad(const int64_t *index, const bool *grad, bool *output, + const size_t dim_before_axis, const size_t dim_at_axis_index, + const size_t dim_at_axis_output, const size_t dim_after_axis, + cudaStream_t stream); + + + diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh similarity index 51% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh index 974e4e36268..b482c13b2fe 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh @@ -14,11 +14,12 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_GATHER_GRAD_GPU_CU_H -#define MINDSPORE_GATHER_GRAD_GPU_CU_H +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_GRAD_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_GRAD_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void GatherGrad(const T *index, const S *grad, S *output, const size_t dim_before_axis, - const size_t dim_at_axis_index, const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); +CUDA_LIB_EXPORT void GatherGrad(const T *index, const S *grad, S *output, const size_t dim_before_axis, + const size_t dim_at_axis_index, const size_t dim_at_axis_output, + const size_t dim_after_axis, cudaStream_t stream); -#endif +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHER_GRAD_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cu new file mode 100644 index 00000000000..670f0909449 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cu @@ -0,0 +1,124 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh" +#include "include/cuda_fp16.h" +template +__global__ void GatherNdKernel(T *input, S *indices, T *output, const size_t output_dim0, const size_t output_dim1, + const size_t indices_dim1, S *batch_indices, S *batch_strides) { + int num = output_dim0 * output_dim1; + int i, j; + for (int write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; + write_index += blockDim.x * gridDim.x) { + i = write_index / output_dim1 % output_dim0; + j = write_index % output_dim1; + + bool out_of_bound = false; + int read_index = 0; + int indices_i = 0; + for (size_t k = 0; k < indices_dim1; k++) { + size_t ind = indices_dim1 * i + k; + indices_i = indices[ind]; + out_of_bound |= !(indices_i < batch_indices[k]); + read_index += indices_i * batch_strides[k]; + } + read_index += j; + + if (!out_of_bound) { + output[write_index] = input[read_index]; + } else { + output[write_index] = 0; + } + } + return; +} +template +void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream) { + int size = output_dim0 * output_dim1; + GatherNdKernel<<>>(input, indices, output, output_dim0, output_dim1, + indices_dim1, batch_indices, batch_strides); + return; +} + +template CUDA_LIB_EXPORT void GatherNd(double *input, int *indices, double *output, + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int *batch_indices, int *batch_strides, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(float *input, int *indices, float *output, const size_t &output_dim0, + const size_t &output_dim1, const size_t &indices_dim1, + int *batch_indices, int *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(half *input, int *indices, half *output, const size_t &output_dim0, + const size_t &output_dim1, const size_t &indices_dim1, + 
int *batch_indices, int *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(int *input, int *indices, int *output, const size_t &output_dim0, + const size_t &output_dim1, const size_t &indices_dim1, + int *batch_indices, int *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(short *input, int *indices, short *output, const size_t &output_dim0, // NOLINT + const size_t &output_dim1, const size_t &indices_dim1, + int *batch_indices, int *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(unsigned int *input, int *indices, unsigned int *output, + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int *batch_indices, + int *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(char *input, int *indices, char *output, const size_t &output_dim0, + const size_t &output_dim1, const size_t &indices_dim1, + int *batch_indices, int *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(unsigned char *input, int *indices, unsigned char *output, + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int *batch_indices, + int *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(bool *input, int *indices, bool *output, const size_t &output_dim0, + const size_t &output_dim1, const size_t &indices_dim1, + int *batch_indices, int *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(double *input, int64_t *indices, double *output, + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int64_t *batch_indices, + int64_t *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(float *input, int64_t *indices, float *output, + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int64_t *batch_indices, + int64_t *batch_strides, cudaStream_t stream); +template 
CUDA_LIB_EXPORT void GatherNd(half *input, int64_t *indices, half *output, + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int64_t *batch_indices, + int64_t *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(int *input, int64_t *indices, int *output, + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int64_t *batch_indices, + int64_t *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(short *input, int64_t *indices, short *output, // NOLINT + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int64_t *batch_indices, + int64_t *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(unsigned int *input, int64_t *indices, + unsigned int *output, const size_t &output_dim0, + const size_t &output_dim1, const size_t &indices_dim1, + int64_t *batch_indices, int64_t *batch_strides, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(char *input, int64_t *indices, char *output, + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int64_t *batch_indices, + int64_t *batch_strides, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(unsigned char *input, int64_t *indices, + unsigned char *output, const size_t &output_dim0, + const size_t &output_dim1, const size_t &indices_dim1, + int64_t *batch_indices, int64_t *batch_strides, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherNd(bool *input, int64_t *indices, bool *output, + const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, int64_t *batch_indices, + int64_t *batch_strides, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cuh rename to 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh index 127a45f1a25..8fe6e68298a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh @@ -14,13 +14,12 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERND_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERND_CUH_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERND_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERND_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1, - const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream); +CUDA_LIB_EXPORT void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1, + const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERND_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERND_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cu new file mode 100755 index 00000000000..7cb9a1d0bbe --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cu @@ -0,0 +1,103 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh" +#include "include/cuda_fp16.h" +template +__global__ void GatherV2Kernel(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1) { + size_t num = output_dim0 * output_dim1 * output_dim2; + size_t i, j, k; + for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; + write_index += blockDim.x * gridDim.x) { + i = write_index / (output_dim1 * output_dim2) % output_dim0; + j = write_index / output_dim2 % output_dim1; + k = write_index % output_dim2; + + if ((indices[j] >= 0) && (indices[j] < input_dim1)) { + size_t read_index = i * input_dim1 * output_dim2 + indices[j] * output_dim2 + k; + output[write_index] = input[read_index]; + } else { + output[write_index] = 0; + } + } + + return; +} +template +void GatherV2(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, cudaStream_t stream) { + size_t size = output_dim0 * output_dim1 * output_dim2; + GatherV2Kernel<<>>(input, indices, output, output_dim0, output_dim1, + output_dim2, input_dim1); + return; +} + +template CUDA_LIB_EXPORT void GatherV2(float *input, int *indices, float *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(float *input, int64_t *indices, float *output, + size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, cudaStream_t 
stream); +template CUDA_LIB_EXPORT void GatherV2(half *input, int *indices, half *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(half *input, int64_t *indices, half *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(double *input, int *indices, double *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(double *input, int64_t *indices, double *output, + size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(int *input, int *indices, int *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(int *input, int64_t *indices, int *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(int16_t *input, int *indices, int16_t *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(int16_t *input, int64_t *indices, int16_t *output, + size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(int8_t *input, int *indices, int8_t *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(int8_t *input, int64_t *indices, int8_t *output, + size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(uint32_t *input, int *indices, uint32_t 
*output, + size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(uint32_t *input, int64_t *indices, uint32_t *output, + size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(uint8_t *input, int *indices, uint8_t *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(uint8_t *input, int64_t *indices, uint8_t *output, + size_t output_dim0, size_t output_dim1, size_t output_dim2, + size_t input_dim1, cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(bool *input, int *indices, bool *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); +template CUDA_LIB_EXPORT void GatherV2(bool *input, int64_t *indices, bool *output, size_t output_dim0, + size_t output_dim1, size_t output_dim2, size_t input_dim1, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh similarity index 62% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh index 944b08f9596..aa82f2f74b2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh @@ -14,15 +14,15 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERV2_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERV2_CUH_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERV2_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERV2_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void GatherV2(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2, - size_t input_dim1, cudaStream_t stream); +CUDA_LIB_EXPORT void GatherV2(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, + size_t output_dim2, size_t input_dim1, cudaStream_t stream); template __global__ void GatherV2Kernel(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2, size_t input_dim1); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_GATHERV2_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GATHERV2_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cu similarity index 89% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cu index 0d6b5614b47..2b46041b1ac 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void GeluKernel(size_t size, T *input_addr, T *output_addr) { @@ -127,7 +127,9 @@ void GeluGradKernel(size_t size, half *dy_addr, half *x_addr, half *dx_addr, cud } } -template void Gelu(size_t size, float *input_addr, float *output_addr, cudaStream_t cuda_stream); -template void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream); -template void GeluGradKernel(size_t size, float *dy_addr, float *x_addr, float *dx_addr, cudaStream_t cuda_stream); -template void GeluGradKernel(size_t size, half *dy_addr, half *x_addr, half *dx_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Gelu(size_t size, float *input_addr, float *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void GeluGradKernel(size_t size, float *dy_addr, float *x_addr, float *dx_addr, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void GeluGradKernel(size_t size, half *dy_addr, half *x_addr, half *dx_addr, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh similarity index 62% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh index 43992e3d260..d856c4b3afc 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh @@ -14,14 +14,13 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SOFTPLUS_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SOFTPLUS_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GELU_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GELU_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void Softplus(const size_t input_size, const T* input_addr, T* output_addr, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void Gelu(size_t input_size, T* input_addr, T* output_addr, cudaStream_t cuda_stream); template -void SoftplusGrad(const size_t size, const T* dy_addr, const T* x_addr, T* dx_addr, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void GeluGradKernel(size_t size, T* dy_addr, T* x_addr, T* dx_addr, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SOFTPLUS_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GELU_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cu similarity index 92% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cu index 195f5f5ea03..500fea07afa 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cu @@ -16,9 +16,8 @@ #include #include -#include "plugin/device/gpu/hal/device/cuda_common.h" #include "include/cuda_fp16.h" -#include "plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh" const int kWarpSize = 32; const int kBlockSize = 512; @@ -327,9 +326,12 @@ void CalGeneralReduction(bool small, 
const T *input, const size_t bound, const s return; } -template void CalGeneralReduction(bool small, const double *input, const size_t bound_, const size_t outerSize_, - const size_t innerSize_, int *index, double *output, cudaStream_t cuda_stream); -template void CalGeneralReduction(bool small, const float *input, const size_t bound_, const size_t outerSize_, - const size_t innerSize_, int *index, float *output, cudaStream_t cuda_stream); -template void CalGeneralReduction(bool small, const half *input, const size_t bound_, const size_t outerSize_, - const size_t innerSize_, int *index, half *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalGeneralReduction(bool small, const double *input, const size_t bound_, + const size_t outerSize_, const size_t innerSize_, int *index, + double *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalGeneralReduction(bool small, const float *input, const size_t bound_, + const size_t outerSize_, const size_t innerSize_, int *index, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalGeneralReduction(bool small, const half *input, const size_t bound_, + const size_t outerSize_, const size_t innerSize_, int *index, + half *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh index b09cf08e4cc..fc6b7237eaf 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh @@ -14,9 +14,10 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GENERAL_REDUCTION_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GENERAL_REDUCTION_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GENERAL_REDUCTION_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GENERAL_REDUCTION_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalGeneralReduction(bool small, const T *input, const size_t bound_, const size_t outerSize_, - const size_t innerSize_, S *index, T *output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GENERAL_REDUCTION_H_ +CUDA_LIB_EXPORT void CalGeneralReduction(bool small, const T *input, const size_t bound_, const size_t outerSize_, + const size_t innerSize_, S *index, T *output, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_GENERAL_REDUCTION_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cu similarity index 80% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cu index fa7550e228b..9e28741bd8c 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cu @@ -15,7 +15,6 @@ */ #include "hash_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" template __global__ void HashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size, @@ -57,8 +56,10 @@ void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_inde return; } -template void DoHashSwapOut(const float *hash_table, float *swap_out_value, const int *swap_out_index, - const int index_size, const int hash_dim, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT 
void DoHashSwapOut(const float *hash_table, float *swap_out_value, + const int *swap_out_index, const int index_size, const int hash_dim, + cudaStream_t cuda_stream); -template void DoHashSwapIn(float *hash_table, const float *swap_in_value, const int *swap_in_index, - const int index_size, const int hash_dim, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void DoHashSwapIn(float *hash_table, const float *swap_in_value, + const int *swap_in_index, const int index_size, const int hash_dim, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh new file mode 100755 index 00000000000..1748047af93 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HASH_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HASH_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, + const int index_size, const int hash_dim, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, + const int index_size, const int hash_dim, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HASH_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cu similarity index 68% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cu index 6558715cad0..f9b897419f3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void HsigmoidKernel(size_t size, const T *input, T *output) { @@ -43,10 +44,12 @@ void CalHSigmoidGrad(const size_t &size, const T *dout, const T *x, T *output, c HsigmoidGradKernel<<>>(size, dout, x, output); } -template void CalHSigmoid(const size_t &size, const half *input, half *output, cudaStream_t cuda_stream); -template void CalHSigmoid(const size_t &size, const float *input, float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalHSigmoid(const size_t &size, const half *input, half *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalHSigmoid(const size_t &size, const float *input, float *output, + cudaStream_t cuda_stream); -template void CalHSigmoidGrad(const size_t &size, const half *dout, const half *x, half *output, - cudaStream_t cuda_stream); -template void CalHSigmoidGrad(const size_t &size, const float *dout, const float *x, float *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalHSigmoidGrad(const size_t &size, const half *dout, const half *x, half *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalHSigmoidGrad(const size_t &size, const float *dout, const float *x, + float *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh new file mode 100644 index 00000000000..fd61ec0aabf --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSIGMOID_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSIGMOID_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void CalHSigmoid(const size_t &size, const T *input, T *output, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalHSigmoidGrad(const size_t &size, const T *dout, const T *x, T *output, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSIGMOID_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cu similarity index 73% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cu index 982e0eba2fb..22f11903e57 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void HSwishKernel(size_t size, const T *input, T *output) { @@ -62,10 +63,12 @@ void CalHSwishGrad(const size_t &size, const T *dout, const T *x, T *output, cud HSwishGradKernel<<>>(size, dout, x, output); } -template void CalHSwish(const size_t &size, const half *input, half *output, cudaStream_t cuda_stream); -template void CalHSwish(const size_t &size, const float *input, float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalHSwish(const size_t &size, const half *input, half *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalHSwish(const size_t &size, const float *input, float *output, + cudaStream_t cuda_stream); -template void CalHSwishGrad(const size_t &size, const half *dout, const half *x, half *output, - cudaStream_t cuda_stream); -template void CalHSwishGrad(const size_t &size, const float *dout, const float *x, float *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalHSwishGrad(const size_t &size, const half *dout, const half *x, half *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalHSwishGrad(const size_t &size, const float *dout, const float *x, float *output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh similarity index 55% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh index 416f40f52d2..18cfb4bf007 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh @@ -14,16 +14,15 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSWISH_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSWISH_IMPL_CUH_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSWISH_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSWISH_IMPL_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalHSwish(const size_t &size, const T *input, T *output, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalHSwish(const size_t &size, const T *input, T *output, cudaStream_t cuda_stream); template -void CalHSwishGrad(const size_t &size, const T *dout, const T *x, T *output, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalHSwishGrad(const size_t &size, const T *dout, const T *x, T *output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSWISH_IMPL_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_HSWISH_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cu similarity index 72% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cu index 8a10e340992..b7571578080 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cu @@ -14,10 +14,8 @@ * limitations under the License. 
*/ #include "in_top_k_impl.cuh" - #include - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "include/cuda_fp16.h" template __global__ void InTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, @@ -39,9 +37,10 @@ void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const batch_size, class_id_count, k); } -template void CalInTopK(const half *predictions, const int32_t *targets, bool *output, const half *top_k_output, - size_t batch_size, size_t class_id_count, int64_t k, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalInTopK(const half *predictions, const int32_t *targets, bool *output, + const half *top_k_output, size_t batch_size, size_t class_id_count, + int64_t k, cudaStream_t cuda_stream); -template void CalInTopK(const float *predictions, const int32_t *targets, bool *output, - const float *top_k_output, size_t batch_size, size_t class_id_count, int64_t k, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalInTopK(const float *predictions, const int32_t *targets, bool *output, + const float *top_k_output, size_t batch_size, size_t class_id_count, + int64_t k, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh index f72b20ab434..5e3fa35c67f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh @@ -14,13 +14,13 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_IN_TOP_K_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_IN_TOP_K_CUH_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IN_TOP_K_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IN_TOP_K_IMPL_CUH_ #include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t batch_size, - size_t class_id_count, int64_t k, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, + size_t batch_size, size_t class_id_count, int64_t k, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_IN_TOP_K_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IN_TOP_K_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cu similarity index 50% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cu index d5743543d23..31921bb7610 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cu @@ -14,10 +14,8 @@ * limitations under the License. 
*/ #include -#include "plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "include/cuda_fp16.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void IndexAddAtomic(T *dst, const int *index, const T *src, const size_t src_size, const size_t outer_size, const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size) { @@ -61,24 +59,29 @@ void CalIndexAdd(T *dst, const int *index, const T *src, const size_t outer_size } } -template void CalIndexAdd(double *dst, const int *index, const double *src, const size_t outer_size, - const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock, - cudaStream_t cuda_stream); -template void CalIndexAdd(float *dst, const int *index, const float *src, const size_t outer_size, - const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock, - cudaStream_t cuda_stream); -template void CalIndexAdd(half *dst, const int *index, const half *src, const size_t outer_size, - const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock, - cudaStream_t cuda_stream); -template void CalIndexAdd(int *dst, const int *index, const int *src, const size_t outer_size, - const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock, - cudaStream_t cuda_stream); -template void CalIndexAdd(int16_t *dst, const int *index, const int16_t *src, const size_t outer_size, - const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock, - cudaStream_t cuda_stream); -template void CalIndexAdd(int8_t *dst, const int *index, const int8_t *src, const size_t outer_size, - const size_t src_axis_size, const size_t 
dst_axis_size, const size_t inner_size, const bool use_lock, - cudaStream_t cuda_stream); -template void CalIndexAdd(uint8_t *dst, const int *index, const uint8_t *src, const size_t outer_size, - const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, const bool use_lock, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIndexAdd(double *dst, const int *index, const double *src, + const size_t outer_size, const size_t src_axis_size, + const size_t dst_axis_size, const size_t inner_size, + const bool use_lock, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIndexAdd(float *dst, const int *index, const float *src, + const size_t outer_size, const size_t src_axis_size, + const size_t dst_axis_size, const size_t inner_size, + const bool use_lock, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIndexAdd(half *dst, const int *index, const half *src, const size_t outer_size, + const size_t src_axis_size, const size_t dst_axis_size, + const size_t inner_size, const bool use_lock, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIndexAdd(int *dst, const int *index, const int *src, const size_t outer_size, + const size_t src_axis_size, const size_t dst_axis_size, + const size_t inner_size, const bool use_lock, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIndexAdd(int16_t *dst, const int *index, const int16_t *src, + const size_t outer_size, const size_t src_axis_size, + const size_t dst_axis_size, const size_t inner_size, + const bool use_lock, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIndexAdd(int8_t *dst, const int *index, const int8_t *src, + const size_t outer_size, const size_t src_axis_size, + const size_t dst_axis_size, const size_t inner_size, + const bool use_lock, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalIndexAdd(uint8_t *dst, const int *index, const uint8_t *src, + const size_t outer_size, const size_t src_axis_size, + const size_t 
dst_axis_size, const size_t inner_size, + const bool use_lock, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh similarity index 50% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh index ab993c93603..ef753c429f6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh @@ -14,9 +14,11 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_INDEXADD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_INDEXADD_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INDEX_ADD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INDEX_ADD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalIndexAdd(T *dst, const int *index, const T *src, const size_t outer_size, const size_t src_axis_size, - const size_t dst_axis_size, const size_t inner_size, const bool use_lock, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_INDEXADD_H_ +CUDA_LIB_EXPORT void CalIndexAdd(T *dst, const int *index, const T *src, const size_t outer_size, + const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size, + const bool use_lock, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INDEX_ADD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cu similarity index 96% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cu rename to 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cu index 3698c082345..bb948cc8e75 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cu @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void CopyMemKernel(const size_t thread_num, const size_t N, const size_t C, float *gamma_addr, float *beta_addr, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh new file mode 100644 index 00000000000..56e1d869c69 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh @@ -0,0 +1,25 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INSTANCE_NORM_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INSTANCE_NORM_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +CUDA_LIB_EXPORT void CopyMemDevice2Device(const size_t N, const size_t C, float *gamma_addr, float *beta_addr, + float *runing_mean_addr, float *runnig_variance_addr, float *ws_gamma, + float *ws_beta, float *ws_mean, float *ws_var, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void ComputeMean(const size_t N, const size_t C, float *dgamma, float *dbeta, const float *ws_dgamma, + const float *ws_dbeta, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INSTANCE_NORM_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cu similarity index 85% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cu index ca8c51e54e5..819a7f54581 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh" +#include "include/cuda_fp16.h" __device__ float CoordinateMax(const float a, const float b) { return (a > b ? 
a : b); @@ -67,7 +68,7 @@ void IOU(const size_t &size, const T *box1, const T *box2, T *iou_results, const IOUKernel<<>>(size, box1, box2, iou_results, mode, input_len_0); } -template void IOU(const size_t &size, const float *box1, const float *box2, float *iou_results, const size_t &mode, - const size_t &input_len_0, cudaStream_t cuda_stream); -template void IOU(const size_t &size, const half *box1, const half *box2, half *iou_results, const size_t &mode, - const size_t &input_len_0, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void IOU(const size_t &size, const float *box1, const float *box2, float *iou_results, + const size_t &mode, const size_t &input_len_0, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void IOU(const size_t &size, const half *box1, const half *box2, half *iou_results, + const size_t &mode, const size_t &input_len_0, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh similarity index 57% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh index c9d0de6238b..7e18195ea53 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh @@ -14,16 +14,15 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_IOU_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_IOU_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IOU_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IOU_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #define IOU_LOCATION_NUM 2 #define IOU_DIMENSION 4 template -void IOU(const size_t &size, const T *box1, const T *box2, T *iou_results, const size_t &mode, - const size_t &input_len_0, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void IOU(const size_t &size, const T *box1, const T *box2, T *iou_results, const size_t &mode, + const size_t &input_len_0, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_IOU_IMPL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_IOU_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cu similarity index 77% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cu index 6c303594cfa..ba136cc10e0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cu @@ -15,8 +15,7 @@ */ #include "l2_loss.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void L2LossKernel(const size_t input_size, const T *input , T *output) { @@ -39,5 +38,7 @@ void L2Loss(const size_t input_size, const T *input , T *output, cudaStream_t st L2LossKernel<<>>(input_size, input, output); } -template void L2Loss(const size_t input_size, const float *input , float *output, cudaStream_t 
stream); -template void L2Loss(const size_t input_size, const half *input , half *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void L2Loss(const size_t input_size, const float *input , float *output, + cudaStream_t stream); +template CUDA_LIB_EXPORT void L2Loss(const size_t input_size, const half *input , half *output, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cuh similarity index 60% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cuh index 428451c84fe..b8d544dea07 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2_loss.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cuh @@ -14,8 +14,9 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_L2_LOSS_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_L2_LOSS_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2_LOSS_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2_LOSS_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void L2Loss(const size_t input_size, const T *input , T *output, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_L2_LOSS_H_ +CUDA_LIB_EXPORT void L2Loss(const size_t input_size, const T *input , T *output, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2_LOSS_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cu similarity index 67% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cu index 
25603f50874..7d617ba33fb 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cu @@ -15,7 +15,6 @@ */ #include "l2normalize_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" #include "include/cuda_fp16.h" template __global__ void AssignEps(const size_t size, const float eps, T* value) { @@ -31,6 +30,9 @@ void GetMaxWithEpsAndValue(const size_t size, const float eps, T* value, cudaStr AssignEps<<>>(size, eps, value); } -template void GetMaxWithEpsAndValue(const size_t size, const float eps, float* value, cudaStream_t cuda_stream); -template void GetMaxWithEpsAndValue(const size_t size, const float eps, half* value, cudaStream_t cuda_stream); -template void GetMaxWithEpsAndValue(const size_t size, const float eps, int* value, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void GetMaxWithEpsAndValue(const size_t size, const float eps, float* value, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void GetMaxWithEpsAndValue(const size_t size, const float eps, half* value, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void GetMaxWithEpsAndValue(const size_t size, const float eps, int* value, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh similarity index 58% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh index 1f37cef9158..9fe7a4d2145 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh @@ -14,9 +14,10 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_L2NORMALIZE_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_L2NORMALIZE_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2NORMALIZE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2NORMALIZE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void GetMaxWithEpsAndValue(const size_t size, const float eps, T* value, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void GetMaxWithEpsAndValue(const size_t size, const float eps, T* value, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_L2NORMALIZE_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_L2NORMALIZE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cu similarity index 92% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cu index 3cf14c491e9..af8bd26b6a2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cu @@ -17,8 +17,9 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh" +#include "include/cuda_fp16.h" constexpr int THREAD_PER_BLOCK = 256; constexpr int NUM_PER_THREAD_REDUCE = 4; @@ -404,12 +405,15 @@ void LayerNormGradGrad(const int &row_dim, const int &col_dim, const int ¶m_ } -template void LayerNormGradGrad(const int &row_dim, const int 
&col_dim, const int ¶m_dim, float *global_sum1, - float *global_sum2, const float &epsilon, const float *dy, const float *x, - const float *mean, const float *var, const float *gamma, const float *grad_dx, - const float *grad_dg, const float *grad_db, float *d_dy, float *d_x, float *d_gamma, - cudaStream_t stream); -template void LayerNormGradGrad(const int &row_dim, const int &col_dim, const int ¶m_dim, half *global_sum1, - half *global_sum2, const half &epsilon, const half *dy, const half *x, const half *mean, - const half *var, const half *gamma, const half *grad_dx, const half *grad_dg, - const half *grad_db, half *d_dy, half *d_x, half *d_gamma, cudaStream_t stream); +template CUDA_LIB_EXPORT void LayerNormGradGrad(const int &row_dim, const int &col_dim, const int ¶m_dim, + float *global_sum1, float *global_sum2, const float &epsilon, + const float *dy, const float *x, const float *mean, const float *var, + const float *gamma, const float *grad_dx, const float *grad_dg, + const float *grad_db, float *d_dy, float *d_x, float *d_gamma, + cudaStream_t stream); +template CUDA_LIB_EXPORT void LayerNormGradGrad(const int &row_dim, const int &col_dim, const int ¶m_dim, + half *global_sum1, half *global_sum2, const half &epsilon, + const half *dy, const half *x, const half *mean, const half *var, + const half *gamma, const half *grad_dx, const half *grad_dg, + const half *grad_db, half *d_dy, half *d_x, half *d_gamma, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh new file mode 100644 index 00000000000..134e90d0e29 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except 
in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void LayerNormGradGrad(const int& row_dim, const int& col_dim, const int& param_dim, T* global_sum1, + T* global_sum2, const T& epsilon, const T* dy, const T* x, const T* mean, + const T* var, const T* gamma, const T* grad_dx, const T* grad_dg, + const T* grad_db, T* d_dy, T* d_x, T* d_gamma, cudaStream_t stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cu similarity index 91% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cu index 9133d4f35eb..974f2e488f3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cu @@ -17,8 +17,9 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh" +#include "include/cuda_fp16.h" constexpr int NUM_PER_THREAD_REDUCE = 4; constexpr int WARP_SIZE = 32; @@ -249,9 +250,11 @@ void LayerNormGrad(const int &row_dim, const int &col_dim, const int &param_dim, epsilon, dy, x, mean, var, dg, db); } -template void LayerNormGrad(const int &row_dim, const int &col_dim, const int &param_dim, const float &epsilon, - const float *dy, const float *x, const float *mean, const float *var, const float *gamma, - float *dx, float *dg, float *db, cudaStream_t stream); -template void LayerNormGrad(const int &row_dim, const int &col_dim, const int &param_dim, const half &epsilon, - const half *dy, const half *x, const half *mean, const half *var, const half *gamma, - half *dx, half *dg, half *db, cudaStream_t stream); +template CUDA_LIB_EXPORT void LayerNormGrad(const int &row_dim, const int &col_dim, const int &param_dim, + const float &epsilon, const float *dy, const float *x, const float *mean, + const float *var, const float *gamma, float *dx, float *dg, float *db, + cudaStream_t stream); +template CUDA_LIB_EXPORT void LayerNormGrad(const int &row_dim, const int &col_dim, const int &param_dim, + const half &epsilon, const half *dy, const half *x, const half *mean, + const half *var, const half *gamma, half *dx, half *dg, half *db, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh new file mode 100644 index 00000000000..29ce6723579 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void LayerNormGrad(const int& row_dim, const int& col_dim, const int& param_dim, const T& epsilon, + const T* dy, const T* x, const T* mean, const T* var, const T* gamma, T* dx, T* dg, + T* db, cudaStream_t stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cu similarity index 89% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cu index 91bc9710276..2fcfdba78c9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cu @@ -17,7 +17,7 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh" constexpr int NUM_PER_THREAD_REDUCE = 4; constexpr int WARP_SIZE = 32; @@ -149,9 +149,9 @@ void LayerNorm(const int &row_dim, const int &col_dim, const int ¶m_dim, con beta, y, mean, var); } -template void 
LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim, const float &epsilon, - const float *x, const float *gamma, const float *beta, float *y, float *mean, float *var, - cudaStream_t stream); -template void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim, const half &epsilon, - const half *x, const half *gamma, const half *beta, half *y, half *mean, half *var, - cudaStream_t stream); +template CUDA_LIB_EXPORT void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim, + const float &epsilon, const float *x, const float *gamma, const float *beta, + float *y, float *mean, float *var, cudaStream_t stream); +template CUDA_LIB_EXPORT void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim, + const half &epsilon, const half *x, const half *gamma, const half *beta, + half *y, half *mean, half *var, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh similarity index 62% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh index cb1674bbc20..5f4ea1ab38e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "include/cuda_fp16.h" template struct DynamicSharedMem; @@ -37,7 +37,7 @@ struct DynamicSharedMem { }; template -void LayerNorm(const int& outer, const int& inner, const int& param_dim, const T& epsilon, const T* x, const T* gamma, - const T* beta, T* y, T* mean, T* var, cudaStream_t stream); +CUDA_LIB_EXPORT void LayerNorm(const int& outer, const int& inner, const int& param_dim, const T& epsilon, const T* x, + const T* gamma, const T* beta, T* y, T* mean, T* var, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LAYER_NORM_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cu similarity index 81% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cu index 30bd353b6a5..d0f5ed05851 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/linspace.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh" #include template @@ -28,5 +28,5 @@ template void calLinSpace(const T *start, const T *stop, const size_t value_count, T *output, cudaStream_t cuda_stream) { LinSpaceKernel<<>>(start, stop, value_count, output); } -template void calLinSpace(const float *start, const float *stop, const size_t value_count, float *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void calLinSpace(const float *start, const float *stop, const size_t value_count, + float *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh similarity index 56% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh index c0d3474b187..4948bb9dd22 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/linspace.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_LINSPACE_IMPL_CU_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_LINSPACE_IMPL_CU_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LINSPACE_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LINSPACE_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void calLinSpace(const T *start, const T *stop, const size_t value_count, T *output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_LINSPACE_IMPL_CU_H_ +CUDA_LIB_EXPORT void calLinSpace(const T *start, const T *stop, const size_t value_count, T *output, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LINSPACE_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cu similarity index 64% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cu index 1d8b4c33408..85de67182c7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cu @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh" #include "include/cuda_fp16.h" template @@ -87,18 +86,24 @@ void CalLocalResponseNormGradNHWC(const T *dy, const T *x, const T *y, const int return; } -template void CalLocalResponseNormNHWC(const float *input, const int depth_radius, const float bias, - const float alpha, const float beta, const size_t channels, const size_t num_elements, float *scale, float *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalLocalResponseNormNHWC(const float *input, const int depth_radius, + const float bias, const float alpha, const float beta, + const size_t channels, const size_t num_elements, + float *scale, float *output, cudaStream_t cuda_stream); -template void CalLocalResponseNormNHWC(const half *input, const int depth_radius, const float bias, - const float alpha, const float beta, const size_t channels, const size_t num_elements, float *scale, half *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalLocalResponseNormNHWC(const half *input, const int depth_radius, + const float bias, const float alpha, const float beta, + const size_t channels, const size_t num_elements, + float *scale, half *output, cudaStream_t cuda_stream); -template void CalLocalResponseNormGradNHWC(const float *dy, const float *x, const float *y, - const int depth_radius, const float bias, const float alpha, const float beta, const size_t channels, - const size_t num_elements, float *scale, float *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalLocalResponseNormGradNHWC(const float *dy, const float *x, const float *y, + const int depth_radius, const float bias, + const float alpha, const float beta, + const size_t channels, const size_t num_elements, + float *scale, float *dx, cudaStream_t cuda_stream); -template void 
CalLocalResponseNormGradNHWC(const half *dy, const half *x, const half *y, - const int depth_radius, const float bias, const float alpha, const float beta, const size_t channels, - const size_t num_elements, float *scale, half *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalLocalResponseNormGradNHWC(const half *dy, const half *x, const half *y, + const int depth_radius, const float bias, + const float alpha, const float beta, + const size_t channels, const size_t num_elements, + float *scale, half *dx, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh new file mode 100644 index 00000000000..0468a0f3778 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh @@ -0,0 +1,31 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOCAL_RESPONSE_NORM_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOCAL_RESPONSE_NORM_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalLocalResponseNormNHWC(const T *input, const int depth_radius, const float bias, + const float alpha, const float beta, const size_t channels, + const size_t num_elements, float *scale, T *output, + cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalLocalResponseNormGradNHWC(const T *dy, const T *x, const T *y, const int depth_radius, + const float bias, const float alpha, const float beta, + const size_t channels, const size_t num_elements, float *scale, T *dx, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOCAL_RESPONSE_NORM_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cu similarity index 85% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cu index 592300196a7..84eccc296a7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cu @@ -16,9 +16,7 @@ #include #include - -#include "plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh" template struct LogicalNotFunc { @@ -37,4 +35,4 @@ void LogicalNotImpl(const int &nums, const T *x, bool *y, cudaStream_t stream) { return LogicalNotKernel><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x, y); } -template void LogicalNotImpl(const int &nums, const bool *x, bool 
*y, cudaStream_t stream); +template CUDA_LIB_EXPORT void LogicalNotImpl(const int &nums, const bool *x, bool *y, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh similarity index 59% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh index dbf8185b67f..4a5af18bbe8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh @@ -14,13 +14,12 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOGICAL_NOT_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOGICAL_NOT_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOGICAL_NOT_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOGICAL_NOT_IMPL_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void LogicalNotImpl(const int &nums, const T *x, bool *y, cudaStream_t stream); +CUDA_LIB_EXPORT void LogicalNotImpl(const int &nums, const T *x, bool *y, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOGICAL_NOT_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOGICAL_NOT_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cu similarity index 70% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cu index 96b99c5de82..0d2d2c76166 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cu @@ -16,7 +16,6 @@ #include #include "loss_with_reduction_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" #include "util.cuh" inline __device__ float logT(float x) { return logf(x); } @@ -383,62 +382,77 @@ void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const dloss, dinput); } -template void KLDivLoss(const int &input_size, const ReductionMode &reduction, const float *input_x, - const float *input_y, float *loss, float *tmp_loss, cudaStream_t stream); +template CUDA_LIB_EXPORT void KLDivLoss(const int &input_size, const ReductionMode &reduction, + const float *input_x, const float *input_y, float *loss, float *tmp_loss, + cudaStream_t stream); -template void KLDivLossGrad(const int &input_size, const ReductionMode &reduction, const float *input_x, - const float *input_y, const float *dloss, float *dx, float *dy, cudaStream_t stream); +template CUDA_LIB_EXPORT void KLDivLossGrad(const int &input_size, const ReductionMode &reduction, + const float *input_x, const float *input_y, const float *dloss, + float *dx, float *dy, cudaStream_t stream); -template void BinaryCrossEntropyLoss(const int &input_size, const ReductionMode &reduction, const float *input_x, - const float *input_y, const float *weight, float *loss, float *tmp_loss, - cudaStream_t stream); +template CUDA_LIB_EXPORT void BinaryCrossEntropyLoss(const int &input_size, const ReductionMode &reduction, + const float *input_x, const float *input_y, + const float *weight, float *loss, float *tmp_loss, + cudaStream_t stream); -template void BinaryCrossEntropyLossGrad(const int &input_size, const ReductionMode &reduction, - const float *input_x, const float *input_y, const float *weight, - const float *dloss, float *dx, cudaStream_t stream); +template CUDA_LIB_EXPORT void 
BinaryCrossEntropyLossGrad(const int &input_size, const ReductionMode &reduction, + const float *input_x, const float *input_y, + const float *weight, const float *dloss, float *dx, + cudaStream_t stream); -template void NLLLoss(const int n, const int c, const ReductionMode reduction, const float *input, - const int32_t *target, const float *weight, float *loss, float *total_weight, - float *tmp_loss, float *tmp_target_weight, cudaStream_t stream); +template CUDA_LIB_EXPORT void NLLLoss(const int n, const int c, const ReductionMode reduction, + const float *input, const int32_t *target, const float *weight, + float *loss, float *total_weight, float *tmp_loss, + float *tmp_target_weight, cudaStream_t stream); -template void NLLLoss(const int n, const int c, const ReductionMode reduction, const float *input, - const int32_t *target, const half *weight, float *loss, half *total_weight, - float *tmp_loss, half *tmp_target_weight, cudaStream_t stream); +template CUDA_LIB_EXPORT void NLLLoss(const int n, const int c, const ReductionMode reduction, + const float *input, const int32_t *target, const half *weight, + float *loss, half *total_weight, float *tmp_loss, + half *tmp_target_weight, cudaStream_t stream); -template void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const float *input, - const int32_t *target, const float *weight, const float *total_weight, - const float *dloss, float *dinput, cudaStream_t stream); +template CUDA_LIB_EXPORT void NLLLossGrad(const int n, const int c, const ReductionMode reduction, + const float *input, const int32_t *target, const float *weight, + const float *total_weight, const float *dloss, float *dinput, + cudaStream_t stream); -template void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const float *input, - const int32_t *target, const half *weight, const half *total_weight, - const float *dloss, float *dinput, cudaStream_t stream); +template CUDA_LIB_EXPORT void NLLLossGrad(const 
int n, const int c, const ReductionMode reduction, + const float *input, const int32_t *target, const half *weight, + const half *total_weight, const float *dloss, float *dinput, + cudaStream_t stream); -template void KLDivLoss(const int &input_size, const ReductionMode &reduction, const half *input_x, - const half *input_y, half *loss, half *tmp_loss, cudaStream_t stream); +template CUDA_LIB_EXPORT void KLDivLoss(const int &input_size, const ReductionMode &reduction, + const half *input_x, const half *input_y, half *loss, half *tmp_loss, + cudaStream_t stream); -template void KLDivLossGrad(const int &input_size, const ReductionMode &reduction, const half *input_x, - const half *input_y, const half *dloss, half *dx, half *dy, cudaStream_t stream); +template CUDA_LIB_EXPORT void KLDivLossGrad(const int &input_size, const ReductionMode &reduction, + const half *input_x, const half *input_y, const half *dloss, half *dx, + half *dy, cudaStream_t stream); -template void BinaryCrossEntropyLoss(const int &input_size, const ReductionMode &reduction, const half *input_x, - const half *input_y, const half *weight, half *loss, half *tmp_loss, - cudaStream_t stream); +template CUDA_LIB_EXPORT void BinaryCrossEntropyLoss(const int &input_size, const ReductionMode &reduction, + const half *input_x, const half *input_y, const half *weight, + half *loss, half *tmp_loss, cudaStream_t stream); -template void BinaryCrossEntropyLossGrad(const int &input_size, const ReductionMode &reduction, - const half *input_x, const half *input_y, const half *weight, - const half *dloss, half *dx, cudaStream_t stream); +template CUDA_LIB_EXPORT void BinaryCrossEntropyLossGrad(const int &input_size, const ReductionMode &reduction, + const half *input_x, const half *input_y, + const half *weight, const half *dloss, half *dx, + cudaStream_t stream); -template void NLLLoss(const int n, const int c, const ReductionMode reduction, const half *input, - const int32_t *target, const half *weight, half 
*loss, half *total_weight, - half *tmp_loss, half *tmp_target_weight, cudaStream_t stream); +template CUDA_LIB_EXPORT void NLLLoss(const int n, const int c, const ReductionMode reduction, + const half *input, const int32_t *target, const half *weight, + half *loss, half *total_weight, half *tmp_loss, + half *tmp_target_weight, cudaStream_t stream); -template void NLLLoss(const int n, const int c, const ReductionMode reduction, const half *input, - const int32_t *target, const float *weight, half *loss, float *total_weight, - half *tmp_loss, float *tmp_target_weight, cudaStream_t stream); +template CUDA_LIB_EXPORT void NLLLoss(const int n, const int c, const ReductionMode reduction, + const half *input, const int32_t *target, const float *weight, + half *loss, float *total_weight, half *tmp_loss, + float *tmp_target_weight, cudaStream_t stream); -template void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const half *input, - const int32_t *target, const half *weight, const half *total_weight, - const half *dloss, half *dinput, cudaStream_t stream); +template CUDA_LIB_EXPORT void NLLLossGrad(const int n, const int c, const ReductionMode reduction, + const half *input, const int32_t *target, const half *weight, + const half *total_weight, const half *dloss, half *dinput, + cudaStream_t stream); -template void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const half *input, - const int32_t *target, const float *weight, const float *total_weight, - const half *dloss, half *dinput, cudaStream_t stream); +template CUDA_LIB_EXPORT void NLLLossGrad(const int n, const int c, const ReductionMode reduction, + const half *input, const int32_t *target, const float *weight, + const float *total_weight, const half *dloss, half *dinput, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh new file mode 100644 index 00000000000..0b6d2ec00c4 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh @@ -0,0 +1,51 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOSS_WITH_REDUCTION_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOSS_WITH_REDUCTION_IMPL_CUH_ +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +enum class ReductionMode { kNone, kMean, kSum }; + +static std::map kReductionModeMap{ + {"none", ReductionMode::kNone}, {"mean", ReductionMode::kMean}, {"sum", ReductionMode::kSum}}; + +template +CUDA_LIB_EXPORT void BinaryCrossEntropyLoss(const int &input_size, const ReductionMode &reduction, const T *input_x, + const T *input_y, const T *weight, T *loss, T *tmp_loss, + cudaStream_t stream); +template +CUDA_LIB_EXPORT void BinaryCrossEntropyLossGrad(const int &input_size, const ReductionMode &reduction, const T *input_x, + const T *input_y, const T *weight, const T *dloss, T *dx, + cudaStream_t stream); +template +CUDA_LIB_EXPORT void KLDivLoss(const int &input_size, const ReductionMode &reduction, const T *input_x, + const T *input_y, T *loss, T *tmp_loss, cudaStream_t stream); +template 
+CUDA_LIB_EXPORT void KLDivLossGrad(const int &input_size, const ReductionMode &reduction, const T *input_x, + const T *input_y, const T *dloss, T *dx, T *dy, cudaStream_t stream); +template +CUDA_LIB_EXPORT void NLLLoss(const int n, const int c, const ReductionMode reduction, const T *input, + const int32_t *target, const S *weight, T *loss, S *total_weight, T *tmp_loss, + S *tmp_target_weight, cudaStream_t stream); +template +CUDA_LIB_EXPORT void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const T *input, + const int32_t *target, const S *weight, const S *total_weight, const T *dloss, + T *dinput, cudaStream_t stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_LOSS_WITH_REDUCTION_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cu similarity index 59% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cu index 7a0d01d2b27..1cd39a50a5e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cu @@ -16,7 +16,7 @@ #include "matrix_band_part_impl.cuh" #include #include -#include "utils/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" template using Complex = mindspore::utils::Complex; @@ -45,13 +45,15 @@ void MatrixBandPart(const size_t size, const T *input_matrix_addr, const size_t l, u, output_addr, cuda_stream); } -template void MatrixBandPart(const size_t size, const int32_t *input_matrix_addr, const size_t m, - const size_t n, const int64_t l, const int64_t u, int32_t *output_addr, - cudaStream_t cuda_stream); -template void MatrixBandPart(const size_t size, const int64_t *input_matrix_addr, const size_t 
m, - const size_t n, const int64_t l, const int64_t u, int64_t *output_addr, - cudaStream_t cuda_stream); -template void MatrixBandPart(const size_t size, const float *input_matrix_addr, const size_t m, const size_t n, - const int64_t l, const int64_t u, float *output_addr, cudaStream_t cuda_stream); -template void MatrixBandPart(const size_t size, const double *input_matrix_addr, const size_t m, const size_t n, - const int64_t l, const int64_t u, double *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixBandPart(const size_t size, const int32_t *input_matrix_addr, + const size_t m, const size_t n, const int64_t l, const int64_t u, + int32_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixBandPart(const size_t size, const int64_t *input_matrix_addr, + const size_t m, const size_t n, const int64_t l, const int64_t u, + int64_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixBandPart(const size_t size, const float *input_matrix_addr, const size_t m, + const size_t n, const int64_t l, const int64_t u, + float *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixBandPart(const size_t size, const double *input_matrix_addr, const size_t m, + const size_t n, const int64_t l, const int64_t u, + double *output_addr, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh similarity index 51% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh index c8f11f340ce..70bed47d1d5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh @@ -14,12 +14,11 @@ * limitations 
under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_BAND_PART_IMPL_CUH -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_BAND_PART_IMPL_CUH - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_BAND_PART_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_BAND_PART_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void MatrixBandPart(const size_t size, const T *input_matrix_addr, const size_t m, const size_t n, const int64_t l, - const int64_t u, T *output_addr, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void MatrixBandPart(const size_t size, const T *input_matrix_addr, const size_t m, const size_t n, + const int64_t l, const int64_t u, T *output_addr, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_BAND_PART_IMPL_CUH +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_BAND_PART_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cu similarity index 89% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cu index b1bd5fdb695..9e0f10717d7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cu @@ -66,7 +66,8 @@ void MatrixCombine(const size_t size, const size_t src_height, const size_t src_ return; } -template void MatrixCombine(const size_t size, const size_t src_height, const size_t src_width, - const size_t dst_width, const size_t residual, const size_t res_width, - const size_t batch, float *input_addr, float *output_addr, cudaStream_t cuda_stream); +template 
CUDA_LIB_EXPORT void MatrixCombine(const size_t size, const size_t src_height, const size_t src_width, + const size_t dst_width, const size_t residual, + const size_t res_width, const size_t batch, float *input_addr, + float *output_addr, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cuh new file mode 100644 index 00000000000..bbc6c2f9c80 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_combine_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_COMBINE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_COMBINE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void MatrixCombine(const size_t size, const size_t src_height, const size_t src_width, + const size_t dst_width, const size_t residual, const size_t res_width, + const size_t batch, T *input_addr, T *output_addr, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_COMBINE_IMPL_CUH_ + diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cu similarity index 58% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cu index 8eb9dd81cf8..6ba617be014 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cu @@ -16,7 +16,7 @@ #include "matrix_diag_part_impl.cuh" #include #include -#include "utils/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" template using Complex = mindspore::utils::Complex; @@ -60,19 +60,23 @@ void MatrixDiagPart(const size_t size, const T *input_matrix_addr, const size_t size, input_matrix_addr, m, n, l, u, num_diags, max_diag_len, la, ua, padding_value, output_addr, cuda_stream); } -template void MatrixDiagPart(const size_t size, const int32_t *input_matrix_addr, const size_t m, - const size_t n, const int64_t l, const int64_t u, const size_t num_diags, - const size_t max_diag_len, const int64_t la, const int64_t ua, - int32_t *padding_value, int32_t *output_addr, cudaStream_t cuda_stream); -template void 
MatrixDiagPart(const size_t size, const int64_t *input_matrix_addr, const size_t m, - const size_t n, const int64_t l, const int64_t u, const size_t num_diags, - const size_t max_diag_len, const int64_t la, const int64_t ua, - int64_t *padding_value, int64_t *output_addr, cudaStream_t cuda_stream); -template void MatrixDiagPart(const size_t size, const float *input_matrix_addr, const size_t m, const size_t n, - const int64_t l, const int64_t u, const size_t num_diags, const size_t max_diag_len, - const int64_t la, const int64_t ua, float *padding_value, float *output_addr, - cudaStream_t cuda_stream); -template void MatrixDiagPart(const size_t size, const double *input_matrix_addr, const size_t m, const size_t n, - const int64_t l, const int64_t u, const size_t num_diags, - const size_t max_diag_len, const int64_t la, const int64_t ua, - double *padding_value, double *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixDiagPart(const size_t size, const int32_t *input_matrix_addr, + const size_t m, const size_t n, const int64_t l, const int64_t u, + const size_t num_diags, const size_t max_diag_len, + const int64_t la, const int64_t ua, int32_t *padding_value, + int32_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixDiagPart(const size_t size, const int64_t *input_matrix_addr, + const size_t m, const size_t n, const int64_t l, const int64_t u, + const size_t num_diags, const size_t max_diag_len, + const int64_t la, const int64_t ua, int64_t *padding_value, + int64_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixDiagPart(const size_t size, const float *input_matrix_addr, const size_t m, + const size_t n, const int64_t l, const int64_t u, + const size_t num_diags, const size_t max_diag_len, const int64_t la, + const int64_t ua, float *padding_value, float *output_addr, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixDiagPart(const size_t size, const double 
*input_matrix_addr, const size_t m, + const size_t n, const int64_t l, const int64_t u, + const size_t num_diags, const size_t max_diag_len, + const int64_t la, const int64_t ua, double *padding_value, + double *output_addr, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh new file mode 100644 index 00000000000..7a6ede2931f --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_DIAG_PART_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_DIAG_PART_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void MatrixDiagPart(const size_t size, const T *input_matrix_addr, const size_t m, const size_t n, + const int64_t l, const int64_t u, const size_t num_diags, const size_t max_diag_len, + const int64_t la, const int64_t ua, T *padding_value, T *output_addr, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_DIAG_PART_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cu similarity index 57% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cu index 01d3c9c1427..28937ec55b9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cu @@ -67,26 +67,29 @@ void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_c return; } -template void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, const int num_diags, - const int max_diag_len, const int lower_index, const int upper_index, - const bool right_align_super_diagonal, const bool right_align_sub_diagonal, - const bool is_single_diag, const int *diag_addr, int *output_addr, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, + const int num_diags, const int max_diag_len, const int lower_index, + const int upper_index, const bool right_align_super_diagonal, + const bool 
right_align_sub_diagonal, const bool is_single_diag, + const int *diag_addr, int *output_addr, cudaStream_t cuda_stream); -template void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, - const int num_diags, const int max_diag_len, const int lower_index, - const int upper_index, const bool right_align_super_diagonal, - const bool right_align_sub_diagonal, const bool is_single_diag, - const int64_t *diag_addr, int64_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, + const int num_diags, const int max_diag_len, const int lower_index, + const int upper_index, const bool right_align_super_diagonal, + const bool right_align_sub_diagonal, const bool is_single_diag, + const int64_t *diag_addr, int64_t *output_addr, + cudaStream_t cuda_stream); -template void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, const int num_diags, - const int max_diag_len, const int lower_index, const int upper_index, - const bool right_align_super_diagonal, const bool right_align_sub_diagonal, - const bool is_single_diag, const float *diag_addr, float *output_addr, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, + const int num_diags, const int max_diag_len, const int lower_index, + const int upper_index, const bool right_align_super_diagonal, + const bool right_align_sub_diagonal, const bool is_single_diag, + const float *diag_addr, float *output_addr, + cudaStream_t cuda_stream); -template void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, - const int num_diags, const int max_diag_len, const int lower_index, - const int upper_index, const bool right_align_super_diagonal, - const bool right_align_sub_diagonal, const bool is_single_diag, - const double *diag_addr, double *output_addr, cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, + const int num_diags, const int max_diag_len, const int lower_index, + const int upper_index, const bool right_align_super_diagonal, + const bool right_align_sub_diagonal, const bool is_single_diag, + const double *diag_addr, double *output_addr, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh new file mode 100644 index 00000000000..d6fdd692512 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SET_DIAG_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SET_DIAG_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, const int num_diags, + const int max_diag_len, const int lower_index, const int upper_index, + const bool right_align_super_diagonal, const bool right_align_sub_diagonal, + const bool is_single_diag, const T *diag_addr, T *output_addr, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SET_DIAG_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cu similarity index 86% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cu index b5ddb4e4835..23fbb71f198 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cu @@ -65,8 +65,8 @@ void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, T return; } -template void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, float *input_addr, - float *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, + float *input_addr, float *output_addr, cudaStream_t cuda_stream); -template void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, double *input_addr, - double *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixSplit(const size_t size, const size_t split_dim, const 
size_t dim, + double *input_addr, double *output_addr, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh index 3e2a808e08e..16281c6381b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh @@ -14,12 +14,11 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXSPLIT_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXSPLIT_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SPLIT_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SPLIT_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, T *output_addr, - cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, + T *output_addr, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXSPLIT_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MATRIX_SPLIT_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cu similarity index 60% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cu rename to 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cu index 3f1489f3c76..8f02f465b3b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cu @@ -16,9 +16,8 @@ #include #include "maxpool_with_argmax_grad_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" #include "include/cuda_fp16.h" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void MaxPoolWithArgmaxGrad(const T* dy, @@ -75,23 +74,23 @@ void CalMaxPoolWithArgmaxGrad(const T* dy, return; } -template void CalMaxPoolWithArgmaxGrad(const float* dy, - const int* index, - const int n, - const int c, - const int xHeight, - const int xWidth, - const int dyHeight, - const int dyWidth, - float* dx, - cudaStream_t cuda_stream); -template void CalMaxPoolWithArgmaxGrad(const half* dy, - const int* index, - const int n, - const int c, - const int xHeight, - const int xWidth, - const int dyHeight, - const int dyWidth, - half* dx, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMaxPoolWithArgmaxGrad(const float* dy, + const int* index, + const int n, + const int c, + const int xHeight, + const int xWidth, + const int dyHeight, + const int dyWidth, + float* dx, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMaxPoolWithArgmaxGrad(const half* dy, + const int* index, + const int n, + const int c, + const int xHeight, + const int xWidth, + const int dyHeight, + const int dyWidth, + half* dx, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cuh new file mode 100644 index 00000000000..bb045c883e0 --- /dev/null +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cuh @@ -0,0 +1,25 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalMaxPoolWithArgmaxGrad(const T* dy, const S* index, const int n, const int c, const int xHeight, + const int xWidth, const int dyHeight, const int dyWidth, T* dx, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cu similarity index 62% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cu index b4dd206b2a5..201b974f847 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cu @@ -16,7 +16,6 @@ #include #include 
"maxpool_with_argmax_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" #include "include/cuda_fp16.h" template __global__ void MaxPoolWithArgmax(const T* input, @@ -112,36 +111,36 @@ void CalMaxPoolWithArgmax(const T* input, index); } -template void CalMaxPoolWithArgmax(const float* input, - const int n, - const int c, - const int h, - const int w, - const int windowHeight, - const int windowWidth, - const int strideHeight, - const int strideWidth, - const int padTop, - const int padLeft, - const int outputHeight, - const int outputWidth, - float* output, - int* index, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMaxPoolWithArgmax(const float* input, + const int n, + const int c, + const int h, + const int w, + const int windowHeight, + const int windowWidth, + const int strideHeight, + const int strideWidth, + const int padTop, + const int padLeft, + const int outputHeight, + const int outputWidth, + float* output, + int* index, + cudaStream_t cuda_stream); -template void CalMaxPoolWithArgmax(const half* input, - const int n, - const int c, - const int h, - const int w, - const int windowHeight, - const int windowWidth, - const int strideHeight, - const int strideWidth, - const int padTop, - const int padLeft, - const int outputHeight, - const int outputWidth, - half* output, - int* index, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMaxPoolWithArgmax(const half* input, + const int n, + const int c, + const int h, + const int w, + const int windowHeight, + const int windowWidth, + const int strideHeight, + const int strideWidth, + const int padTop, + const int padLeft, + const int outputHeight, + const int outputWidth, + half* output, + int* index, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cuh new file mode 100644 index 
00000000000..24b8afc8cdb --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalMaxPoolWithArgmax(const T* input, const int n, const int c, const int h, const int w, + const int windowHeight, const int windowWidth, const int strideHeight, + const int strideWidth, const int padTop, const int padLeft, + const int outputHeight, const int outputWidth, T* output, S *index, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MAXPOOL_WITH_ARGMAX_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cu similarity index 98% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cu index fb62fb413ca..f1cff4f10fd 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cu +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cu @@ -20,7 +20,6 @@ #include #include #include "minmax_update_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" __global__ void UpdateInputMinMaxPerLayerWithEMA(const float *input_min, const float *input_max, float *output_min, float *output_max, const float min, const float max, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh new file mode 100644 index 00000000000..828ec9fa7dc --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MINMAX_UPDATE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MINMAX_UPDATE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +CUDA_LIB_EXPORT void CalMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min, + float *output_max, const int total_num, const int channel_num, + const float ema_decay, const bool ema, cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT void CalMinMaxPerLayer(float *input, float *input_min, float *input_max, float *output_min, + float *output_max, const int size, const float ema_decay, const bool ema, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MINMAX_UPDATE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cu similarity index 79% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cu index 65e88876fe6..ad5170ac1e1 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cu @@ -16,7 +16,8 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh" +#include "include/cuda_fp16.h" // check for existence in current padded array on X and Y dims __inline__ __device__ bool range_check(int x, int y, int padded_width, int padded_height) { @@ -252,27 +253,33 @@ void CalMirrorPadGrad(const size_t dx_size, const size_t interim_dy_size, T *dy, mode, dx); } -template void CalMirrorPad(const size_t size, const float *input, const int old_batch, const int old_channel, - const int 
old_height, const int old_width, const int padded_height, - const int padded_width, int padd_num, const int64_t *paddings, int mode, - float *output, cudaStream_t cuda_stream); -template void CalMirrorPad(const size_t size, const half *input, const int old_batch, const int old_channel, - const int old_height, const int old_width, const int padded_height, - const int padded_width, int padd_num, const int64_t *paddings, int mode, half *output, - cudaStream_t cuda_stream); -template void CalMirrorPad(const size_t size, const int *input, const int old_batch, const int old_channel, - const int old_height, const int old_width, const int padded_height, - const int padded_width, int padd_num, const int64_t *paddings, int mode, int *output, - cudaStream_t cuda_stream); -template void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, float *dy, float *interim_dy, - const int dx_batches, const int dx_channels, const int dx_height, - const int dx_width, const int dy_height, const int dy_width, const int padd_dim, - const int64_t *paddings, int mode, float *dx, cudaStream_t cuda_stream); -template void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, half *dy, half *interim_dy, - const int dx_batches, const int dx_channels, const int dx_height, - const int dx_width, const int dy_height, const int dy_width, const int padd_dim, - const int64_t *paddings, int mode, half *dx, cudaStream_t cuda_stream); -template void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, int *dy, int *interim_dy, - const int dx_batches, const int dx_channels, const int dx_height, - const int dx_width, const int dy_height, const int dy_width, const int padd_dim, - const int64_t *paddings, int mode, int *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMirrorPad(const size_t size, const float *input, const int old_batch, + const int old_channel, const int old_height, const int old_width, + const int padded_height, const int padded_width, int padd_num, + 
const int64_t *paddings, int mode, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMirrorPad(const size_t size, const half *input, const int old_batch, + const int old_channel, const int old_height, const int old_width, + const int padded_height, const int padded_width, int padd_num, + const int64_t *paddings, int mode, half *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMirrorPad(const size_t size, const int *input, const int old_batch, + const int old_channel, const int old_height, const int old_width, + const int padded_height, const int padded_width, int padd_num, + const int64_t *paddings, int mode, int *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, float *dy, + float *interim_dy, const int dx_batches, const int dx_channels, + const int dx_height, const int dx_width, const int dy_height, + const int dy_width, const int padd_dim, const int64_t *paddings, + int mode, float *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, half *dy, + half *interim_dy, const int dx_batches, const int dx_channels, + const int dx_height, const int dx_width, const int dy_height, + const int dy_width, const int padd_dim, const int64_t *paddings, + int mode, half *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, int *dy, + int *interim_dy, const int dx_batches, const int dx_channels, + const int dx_height, const int dx_width, const int dy_height, + const int dy_width, const int padd_dim, const int64_t *paddings, + int mode, int *dx, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh new file mode 100755 index 00000000000..6a4e705f97a --- /dev/null +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh @@ -0,0 +1,48 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MIRROR_PAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MIRROR_PAD_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +// preset size of paddings +#define MAX_PADDINGS 4 +#define PADDING_SIZE 2 + +// define constants for kernel indexing use +#define BATCH 0 * PADDING_SIZE +#define CHANNEL 1 * PADDING_SIZE +#define HEIGHT 2 * PADDING_SIZE +#define WIDTH 3 * PADDING_SIZE +#define TOP 0 +#define BOTTOM 1 +#define LEFT 0 +#define RIGHT 1 + +template +CUDA_LIB_EXPORT void CalMirrorPad(const size_t size, const T *input, const int old_batch, const int old_channel, + const int old_height, const int old_width, const int padded_height, + const int padded_width, int padd_num, const int64_t *paddings, int mode, T *output, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, T *dy, T *interim, + const int output_batch, const int output_channel, const int output_height, + const int output_width, const int input_height, const int input_width, + const int padd_dim, const int64_t *paddings, int mode, T *dx, + cudaStream_t cuda_stream); + +#endif // 
MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MIRROR_PAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cu similarity index 62% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cu index 8c4934a32b4..b7d2d3db7f3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cu @@ -15,6 +15,7 @@ */ #include "momentum_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient, const S *momentum, bool use_nesterov) { @@ -175,52 +176,64 @@ void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, co num, element_num, weight_decay, scale, variable, accumulation, learning_rate, gradient, momentum); } // end CombineFusedWeightDecayScaleMomentum -template void MomentumUpdateVariable(const size_t size, float *variable, float *accumulation, - const float *learning_rate, const float *gradient, - const float *momentum, bool use_nesterov, - cudaStream_t cuda_stream); -template void MomentumUpdateVariable(const size_t size, half *variable, half *accumulation, - const half *learning_rate, const half *gradient, - const half *momentum, bool use_nesterov, - cudaStream_t cuda_stream); -template void MomentumUpdateVariable(const size_t size, half *variable, half *accumulation, - const float *learning_rate, const half *gradient, - const float *momentum, bool use_nesterov, - cudaStream_t cuda_stream); -template void MomentumUpdateVariable(const size_t size, float *variable, float *accumulation, - const float *learning_rate, const half *gradient, - const float *momentum, bool use_nesterov, - 
cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MomentumUpdateVariable(const size_t size, float *variable, + float *accumulation, + const float *learning_rate, + const float *gradient, const float *momentum, + bool use_nesterov, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MomentumUpdateVariable(const size_t size, half *variable, + half *accumulation, const half *learning_rate, + const half *gradient, const half *momentum, + bool use_nesterov, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MomentumUpdateVariable(const size_t size, half *variable, + half *accumulation, const float *learning_rate, + const half *gradient, const float *momentum, + bool use_nesterov, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MomentumUpdateVariable(const size_t size, float *variable, + float *accumulation, + const float *learning_rate, + const half *gradient, const float *momentum, + bool use_nesterov, cudaStream_t cuda_stream); -template void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale, - float *variable, float *accumulation, const float *learning_rate, - const float *gradient, const float *momentum, cudaStream_t cuda_stream); -template void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale, - float *variable, float *accumulation, const float *learning_rate, - const half *gradient, const float *momentum, cudaStream_t cuda_stream); -template void FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable, - float *accumulation, const float *learning_rate, const float *gradient, - const float *momentum, cudaStream_t cuda_stream); -template void FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable, - float *accumulation, const float *learning_rate, const half *gradient, - const float *momentum, cudaStream_t cuda_stream); -template void FusedScaleMomentum(const size_t element_num, float *scale, 
float *variable, float *accumulation, - const float *learning_rate, const float *gradient, const float *momentum, - cudaStream_t cuda_stream); -template void FusedScaleMomentum(const size_t element_num, float *scale, float *variable, float *accumulation, - const float *learning_rate, const half *gradient, const float *momentum, - cudaStream_t cuda_stream); -template void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *elements, - float **weight_decay, float **scale, float **variable, - float **accumulation, float **learning_rate, float **gradient, - float **momentum, cudaStream_t cuda_stream); -template void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *elements, - float **weight_decay, float **scale, float **variable, - float **accumulation, float **learning_rate, half **gradient, - float **momentum, cudaStream_t cuda_stream); -template void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, float **scale, - float **variable, float **accumulation, float **learning_rate, float **gradient, - float **momentum, cudaStream_t cuda_stream); -template void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, float **scale, - float **variable, float **accumulation, float **learning_rate, half **gradient, - float **momentum, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale, + float *variable, float *accumulation, + const float *learning_rate, const float *gradient, + const float *momentum, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale, + float *variable, float *accumulation, + const float *learning_rate, const half *gradient, + const float *momentum, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void 
FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable, + float *accumulation, const float *learning_rate, + const float *gradient, const float *momentum, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable, + float *accumulation, const float *learning_rate, + const half *gradient, const float *momentum, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void FusedScaleMomentum(const size_t element_num, float *scale, float *variable, + float *accumulation, const float *learning_rate, const float *gradient, + const float *momentum, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void FusedScaleMomentum(const size_t element_num, float *scale, float *variable, + float *accumulation, const float *learning_rate, const half *gradient, + const float *momentum, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, + const size_t *elements, float **weight_decay, + float **scale, float **variable, + float **accumulation, float **learning_rate, + float **gradient, float **momentum, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, + const size_t *elements, float **weight_decay, + float **scale, float **variable, + float **accumulation, float **learning_rate, + half **gradient, float **momentum, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, + float **scale, float **variable, float **accumulation, + float **learning_rate, float **gradient, float **momentum, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, + float **scale, float **variable, float **accumulation, + float **learning_rate, half 
**gradient, float **momentum, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh new file mode 100644 index 00000000000..df9d40393c8 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh @@ -0,0 +1,45 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MOMENTUM_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MOMENTUM_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, + const G *gradient, const S *momentum, bool use_nesterov, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void FusedWeightDecayScaleMomentum(const size_t element_num, T *weight_decay, T *scale, T *variable, + T *accumulation, const T *learning_rate, const S *gradient, + const T *momentum, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void FusedWeightDecayMomentum(const size_t element_num, T *weight_decay, T *variable, T *accumulation, + const T *learning_rate, const S *gradient, const T *momentum, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void FusedScaleMomentum(const size_t element_num, T *scale, T *variable, T *accumulation, + const T *learning_rate, const S *gradient, const T *momentum, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *element, + T **weight_decay, T **scale, T **variable, T **accumulation, + T **learning_rate, S **gradient, T **momentum, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *element, T **scale, + T **variable, T **accumulation, T **learning_rate, S **gradient, + T **momentum, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MOMENTUM_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cu similarity index 86% rename from 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cu index f69cbb3d653..7ad4ebc91c1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cu @@ -136,8 +136,9 @@ void Multinomial(int row, int col, T *probs, curandState *state, int64_t *num_sa MultinomialKernel<<>>(row, col, probs, state, num_sample, output); } -template void Multinomial(int row, int col, float *probs, curandState *state, int64_t *num_sample, int *output, - cudaStream_t stream); -template void CheckNonNeg(const size_t size, const float *input, float *output, cudaStream_t cuda_stream); -template void CheckZero(const size_t distributions, const size_t categories, const float *input, float *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Multinomial(int row, int col, float *probs, curandState *state, + int64_t *num_sample, int *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CheckNonNeg(const size_t size, const float *input, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CheckZero(const size_t distributions, const size_t categories, const float *input, + float *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cuh new file mode 100644 index 00000000000..bfd82d2392d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cuh @@ -0,0 +1,32 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MULTINOMIAL_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MULTINOMIAL_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +CUDA_LIB_EXPORT void InitRandState(int seed, int num, curandState *state, cudaStream_t stream); +template +CUDA_LIB_EXPORT void Multinomial(int row, int col, T *probs, curandState *rand_state, int64_t *num_sample, int *output, + cudaStream_t stream); +template +CUDA_LIB_EXPORT void CheckNonNeg(const size_t size, const T *input, T *output, cudaStream_t stream); +template +CUDA_LIB_EXPORT void CheckZero(const size_t distributions, const size_t categories, const T *input, T *output, + cudaStream_t stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_MULTINOMIAL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cu similarity index 92% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cu index 413c5f0d4a9..08d9044852d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cu @@ -198,11 +198,12 @@ void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, in ReducePass<<<1, 
GET_THREADS, 0, cuda_stream>>>(num, sel_boxes, row_mask); } -template void CalSort(const int &inner, float *data_in, float *data_out, int *index_buff, float *data_buff, - int box_size, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalSort(const int &inner, float *data_in, float *data_out, int *index_buff, + float *data_buff, int box_size, cudaStream_t stream); -template void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, float *input, float *output, - int *index_buff, int box_size, bool *row_mask, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, float *input, + float *output, int *index_buff, int box_size, bool *row_mask, + cudaStream_t cuda_stream); -template void CalNms(const int num, const float IOU_value, float *output, bool *sel_boxes, int box_size, - bool *row_mask, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNms(const int num, const float IOU_value, float *output, bool *sel_boxes, + int box_size, bool *row_mask, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cuh new file mode 100644 index 00000000000..066a4207b98 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cuh @@ -0,0 +1,35 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_NMS_WITH_MASK_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_NMS_WITH_MASK_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void CalSort(const int &inner, T *data_in, T *data_out, int *index_buff, T *data_buff, int box_size_, + cudaStream_t stream); + +template +CUDA_LIB_EXPORT void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *input, T *output, int *index_buff, + int box_size_, bool *row_mask, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size_, + bool *row_mask, cudaStream_t cuda_stream); + +CUDA_LIB_EXPORT int NmsRoundUpPower2(int v); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_NMS_WITH_MASK_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cu similarity index 61% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cu index 9aada57e6c4..1336cf1231e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cu @@ -15,7 +15,7 @@ */ #include "one_hot_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "include/cuda_fp16.h" template __global__ void OneHotKernel(size_t size, const S *indices, size_t depth, const T *on_value, const T *off_value, size_t left_dim_size, size_t right_dim_size, T *output) { @@ -45,13 +45,15 @@ void OneHot(const S *indices, size_t depth, const T *on_value, const T *off_valu left_dim_size, 
right_dim_size, output); return; } -template void OneHot(const int *indices, size_t depth, const float *on_value, const float *off_value, - size_t left_dim_size, size_t right_dim_size, float *output, cudaStream_t cuda_stream); -template void OneHot(const int *indices, size_t depth, const half *on_value, const half *off_value, - size_t left_dim_size, size_t right_dim_size, half *output, cudaStream_t cuda_stream); -template void OneHot(const int64_t *indices, size_t depth, const float *on_value, - const float *off_value, size_t left_dim_size, size_t right_dim_size, float *output, - cudaStream_t cuda_stream); -template void OneHot(const int64_t *indices, size_t depth, const half *on_value, const half *off_value, - size_t left_dim_size, size_t right_dim_size, half *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void OneHot(const int *indices, size_t depth, const float *on_value, + const float *off_value, size_t left_dim_size, size_t right_dim_size, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void OneHot(const int *indices, size_t depth, const half *on_value, + const half *off_value, size_t left_dim_size, size_t right_dim_size, + half *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void OneHot(const int64_t *indices, size_t depth, const float *on_value, + const float *off_value, size_t left_dim_size, + size_t right_dim_size, float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void OneHot(const int64_t *indices, size_t depth, const half *on_value, + const half *off_value, size_t left_dim_size, size_t right_dim_size, + half *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh 
index 5b5991256ec..65eaaa4d46e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh @@ -14,10 +14,11 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONE_HOT_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONE_HOT_IMPL_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONE_HOT_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONE_HOT_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void OneHot(const S *indices, size_t depth_, const T *on_value, const T *off_value, size_t left_dim_size, - size_t right_dim_size, T *output, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void OneHot(const S *indices, size_t depth_, const T *on_value, const T *off_value, + size_t left_dim_size, size_t right_dim_size, T *output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONE_HOT_IMPL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONE_HOT_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cu new file mode 100644 index 00000000000..ad510fac826 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cu @@ -0,0 +1,56 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "oneslike_impl.cuh" +#include "include/cuda_fp16.h" +template +__global__ void OnesLike(const size_t size, const T* input, T* output) { + int one = 1; + T val = static_cast(one); + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + output[pos] = val; + } + return; +} +template +void CalOnesLike(const size_t size, const T* input, T* output, cudaStream_t cuda_stream) { + OnesLike<<>>(size, input, output); + return; +} + +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const double* input, double* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const float* input, float* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const half* input, half* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const int8_t* input, int8_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const int16_t* input, int16_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const int32_t* input, int32_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const int64_t* input, int64_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const uint8_t* input, uint8_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const uint16_t* input, uint16_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const uint32_t* input, uint32_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const uint64_t* input, uint64_t* 
output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh similarity index 59% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh index 21bd995fb7b..88d62d30a34 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh @@ -14,10 +14,10 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONESLIKE_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONESLIKE_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONESLIKE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONESLIKE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalOnesLike(const size_t size, const T* input, T* output, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalOnesLike(const size_t size, const T* input, T* output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ONESLIKE_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ONESLIKE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cu new file mode 100755 index 00000000000..0cb05efef20 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cu @@ -0,0 +1,74 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh" +#include "include/cuda_fp16.h" +template +__global__ void Pack(const size_t size, const size_t input_num, const size_t dims_behind_axis, T** inputs, T* output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { + size_t cur_input_index = pos / dims_behind_axis % input_num; + size_t cycle_len = input_num * dims_behind_axis; + size_t local_index = pos / cycle_len * dims_behind_axis + pos % cycle_len % dims_behind_axis; + output[pos] = inputs[cur_input_index][local_index]; + } + return; +} + +template +void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, T** inputs, T* output, + cudaStream_t cuda_stream) { + Pack<<>>(size, input_num, dims_behind_axis, inputs, output); + return; +} + + +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, int8_t** inputs, int8_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, int16_t** inputs, int16_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, int** inputs, int* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, int64_t** inputs, int64_t* 
output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, uint8_t** inputs, uint8_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, uint16_t** inputs, uint16_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, uint32_t** inputs, uint32_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, uint64_t** inputs, uint64_t* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, half** inputs, half* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, float** inputs, float* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void PackKernel(const size_t size, const size_t input_num, + const size_t dims_behind_axis, bool** inputs, bool* output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh similarity index 50% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh index a74c125d6a9..0838eea47b4 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh @@ -14,15 +14,14 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_PACK_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_PACK_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PACK_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PACK_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void PackKernel(const size_t size, - const size_t input_num, - const size_t dims_behind_axis, - T** inputs, - T* output, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_PACK_H_ +CUDA_LIB_EXPORT void PackKernel(const size_t size, + const size_t input_num, + const size_t dims_behind_axis, + T** inputs, + T* output, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PACK_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cu similarity index 65% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cu index e6ac41ea9c6..32262056c38 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cu @@ -16,7 +16,8 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" +#include "include/cuda_fp16.h" // For internal OP use, not user facing template @@ -268,72 +269,77 @@ void CalPadGradNDHWC(const size_t size, const T *dy, const int num, const int ol pad_head, pad_top, pad_left, dx); } -template void CalPad(const size_t size, const float* input, const int num, const int channels, - const int old_height, const int old_width, const int padded_height, const int padded_width, - const int pad_top, const int pad_left, float 
pad_value, float* output, - cudaStream_t cuda_stream); -template void CalPadGrad(const size_t size, const float* dy, const int num, const int channels, - const int old_height, const int old_width, const int padded_height, - const int padded_width, const int pad_top, const int pad_left, float* dx, - cudaStream_t cuda_stream); -template void CalPad(const size_t size, const half* input, const int num, const int channels, - const int old_height, const int old_width, const int padded_height, const int padded_width, - const int pad_top, const int pad_left, float pad_value, half* output, - cudaStream_t cuda_stream); -template void CalPadGrad(const size_t size, const half* dy, const int num, const int channels, - const int old_height, const int old_width, const int padded_height, - const int padded_width, const int pad_top, const int pad_left, half* dx, - cudaStream_t cuda_stream); -template void CalPadNHWC(const size_t size, const float* input, const int num, const int old_height, - const int old_width, const int channels, const int padded_height, - const int padded_width, const int pad_top, const int pad_left, float pad_value, - float* output, cudaStream_t cuda_stream); -template void CalPadNHWC(const size_t size, const half* input, const int num, const int old_height, - const int old_width, const int channels, const int padded_height, - const int padded_width, const int pad_top, const int pad_left, float pad_value, - half* output, cudaStream_t cuda_stream); -template void CalPadGradNHWC(const size_t size, const float* dy, const int num, const int old_height, - const int old_width, const int channels, const int padded_height, - const int padded_width, const int pad_top, const int pad_left, float* dx, - cudaStream_t cuda_stream); -template void CalPadGradNHWC(const size_t size, const half* dy, const int num, const int old_height, - const int old_width, const int channels, const int padded_height, - const int padded_width, const int pad_top, const int pad_left, half* dx, - 
cudaStream_t cuda_stream); -template void CalPadGeneral(const float *input, float *output, const size_t *input_shape, const size_t *strides, - const int *paddings, const int input_size, const size_t input_rank, - cudaStream_t cuda_stream); -template void CalPadGeneral(const half *input, half *output, const size_t *input_shape, const size_t *strides, - const int *paddings, const int input_size, const size_t input_rank, - cudaStream_t cuda_stream); -template void CalPadGeneral(const int *input, int *output, const size_t *input_shape, const size_t *strides, - const int *paddings, const int input_size, const size_t input_rank, - cudaStream_t cuda_stream); -template void CalPad3d(const size_t size, const float* input, const int num, const int channels, - const int old_depth, const int old_height, const int old_width, const int padded_depth, - const int padded_height, const int padded_width, const int pad_head, const int pad_top, - const int pad_left, const float pad_value, float* output, cudaStream_t cuda_stream); -template void CalPad3d(const size_t size, const half* input, const int num, const int channels, - const int old_depth, const int old_height, const int old_width, const int padded_depth, - const int padded_height, const int padded_width, const int pad_head, const int pad_top, - const int pad_left, const float pad_value, half* output, cudaStream_t cuda_stream); -template void CalPadGrad3d(const size_t size, const float* dy, const int num, const int channels, - const int old_depth, const int old_height, const int old_width, - const int padded_depth, const int padded_height, const int padded_width, - const int pad_head, const int pad_top, const int pad_left, float* dx, - cudaStream_t cuda_stream); -template void CalPadGrad3d(const size_t size, const half* dy, const int num, const int channels, - const int old_depth, const int old_height, const int old_width, - const int padded_depth, const int padded_height, const int padded_width, - const int pad_head, const int 
pad_top, const int pad_left, half* dx, - cudaStream_t cuda_stream); -template void CalPadGradNDHWC(const size_t size, const float *dy, const int num, const int old_depth, - const int old_height, const int old_width, const int channels, - const int padded_depth, const int padded_height, const int padded_width, - const int pad_head, const int pad_top, const int pad_left, float *dx, - cudaStream_t cuda_stream); -template void CalPadGradNDHWC(const size_t size, const half *dy, const int num, const int old_depth, - const int old_height, const int old_width, const int channels, - const int padded_depth, const int padded_height, const int padded_width, - const int pad_head, const int pad_top, const int pad_left, half *dx, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPad(const size_t size, const float* input, const int num, const int channels, + const int old_height, const int old_width, const int padded_height, + const int padded_width, const int pad_top, const int pad_left, + float pad_value, float* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGrad(const size_t size, const float* dy, const int num, const int channels, + const int old_height, const int old_width, const int padded_height, + const int padded_width, const int pad_top, const int pad_left, + float* dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPad(const size_t size, const half* input, const int num, const int channels, + const int old_height, const int old_width, const int padded_height, + const int padded_width, const int pad_top, const int pad_left, + float pad_value, half* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGrad(const size_t size, const half* dy, const int num, const int channels, + const int old_height, const int old_width, const int padded_height, + const int padded_width, const int pad_top, const int pad_left, half* dx, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadNHWC(const size_t 
size, const float* input, const int num, + const int old_height, const int old_width, const int channels, + const int padded_height, const int padded_width, const int pad_top, + const int pad_left, float pad_value, float* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadNHWC(const size_t size, const half* input, const int num, + const int old_height, const int old_width, const int channels, + const int padded_height, const int padded_width, const int pad_top, + const int pad_left, float pad_value, half* output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGradNHWC(const size_t size, const float* dy, const int num, + const int old_height, const int old_width, const int channels, + const int padded_height, const int padded_width, const int pad_top, + const int pad_left, float* dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGradNHWC(const size_t size, const half* dy, const int num, + const int old_height, const int old_width, const int channels, + const int padded_height, const int padded_width, const int pad_top, + const int pad_left, half* dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGeneral(const float *input, float *output, const size_t *input_shape, + const size_t *strides, const int *paddings, const int input_size, + const size_t input_rank, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGeneral(const half *input, half *output, const size_t *input_shape, + const size_t *strides, const int *paddings, const int input_size, + const size_t input_rank, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGeneral(const int *input, int *output, const size_t *input_shape, + const size_t *strides, const int *paddings, const int input_size, + const size_t input_rank, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPad3d(const size_t size, const float* input, const int num, const int channels, + const int old_depth, const int old_height, const 
int old_width, + const int padded_depth, const int padded_height, const int padded_width, + const int pad_head, const int pad_top, const int pad_left, + const float pad_value, float* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPad3d(const size_t size, const half* input, const int num, const int channels, + const int old_depth, const int old_height, const int old_width, + const int padded_depth, const int padded_height, const int padded_width, + const int pad_head, const int pad_top, const int pad_left, + const float pad_value, half* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGrad3d(const size_t size, const float* dy, const int num, const int channels, + const int old_depth, const int old_height, const int old_width, + const int padded_depth, const int padded_height, + const int padded_width, const int pad_head, const int pad_top, + const int pad_left, float* dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGrad3d(const size_t size, const half* dy, const int num, const int channels, + const int old_depth, const int old_height, const int old_width, + const int padded_depth, const int padded_height, + const int padded_width, const int pad_head, const int pad_top, + const int pad_left, half* dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGradNDHWC(const size_t size, const float *dy, const int num, + const int old_depth, const int old_height, const int old_width, + const int channels, const int padded_depth, + const int padded_height, const int padded_width, + const int pad_head, const int pad_top, const int pad_left, + float *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalPadGradNDHWC(const size_t size, const half *dy, const int num, + const int old_depth, const int old_height, const int old_width, + const int channels, const int padded_depth, const int padded_height, + const int padded_width, const int pad_head, const int pad_top, + const int pad_left, half 
*dx, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh new file mode 100644 index 00000000000..081a98c1523 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh @@ -0,0 +1,66 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PAD_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void CalPad(const size_t size, const T* input, const int num, const int channels, const int old_height, + const int old_width, const int padded_height, const int padded_width, const int pad_top, + const int pad_left, float pad_value, T* output, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalPadGrad(const size_t size, const T* dy, const int num, const int channels, const int old_height, + const int old_width, const int padded_height, const int padded_width, const int pad_top, + const int pad_left, T* dx, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalPadNHWC(const size_t size, const T* input, const int num, const int old_height, + const int old_width, const int channels, const int padded_height, + 
const int padded_width, const int pad_top, const int pad_left, float pad_value, + T* output, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalPadGradNHWC(const size_t size, const T* input, const int num, const int old_height, + const int old_width, const int channels, const int padded_height, + const int padded_width, const int pad_top, const int pad_left, T* output, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalPadGeneral(const T *input, T *output, const size_t *input_shape, const size_t *strides, + const int *paddings, const int input_size, const size_t input_rank, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalPad3d(const size_t size, const T* input, const int num, const int channels, const int old_depth, + const int old_height, const int old_width, const int padded_depth, + const int padded_height, const int padded_width, const int pad_head, const int pad_top, + const int pad_left, const float pad_value, T* output, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalPadGrad3d(const size_t size, const T* dy, const int num, const int channels, + const int old_depth, const int old_height, const int old_width, + const int padded_depth, const int padded_height, const int padded_width, + const int pad_head, const int pad_top, const int pad_left, T* dx, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalPadNDHWC(const size_t size, const T *input, const int num, const int old_depth, + const int old_height, const int old_width, const int channels, const int padded_depth, + const int padded_height, const int padded_width, const int pad_head, const int pad_top, + const int pad_left, const float pad_value, T *output, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalPadGradNDHWC(const size_t size, const T *dy, const int num, const int old_depth, + const int old_height, const int old_width, const int channels, + const int padded_depth, const int padded_height, const int padded_width, + const 
int pad_head, const int pad_top, const int pad_left, T *dx, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cu similarity index 83% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cu index 9cfb1948b7c..ed560e6cd06 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cu @@ -14,9 +14,8 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void CalPReLUGradKernel(size_t size, size_t weight_size, size_t per_channel_size, @@ -62,7 +61,7 @@ void CalPReLUGrad(size_t size, size_t weight_size, size_t per_channel_size, return; } -template void CalPReLUGrad(size_t, size_t, size_t, const float *, const float *, const float *, - float *, float *, float *, cudaStream_t); -template void CalPReLUGrad(size_t, size_t, size_t, const half *, const half *, const half *, - half *, half *, float *, cudaStream_t); +template CUDA_LIB_EXPORT void CalPReLUGrad(size_t, size_t, size_t, const float *, const float *, const float *, + float *, float *, float *, cudaStream_t); +template CUDA_LIB_EXPORT void CalPReLUGrad(size_t, size_t, size_t, const half *, const half *, const half *, + half *, half *, float *, cudaStream_t); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh index 90bbda6bc05..8f31c2ed5f9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh @@ -14,12 +14,11 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_GRAD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_GRAD_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalPReLUGrad(size_t input_size, size_t weight_size, size_t per_channel_size, - const T *dy, const T *x, const T *w, T *dx, T *dw, float *dw_array, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_GRAD_H_ +CUDA_LIB_EXPORT void CalPReLUGrad(size_t input_size, size_t weight_size, size_t per_channel_size, const T *dy, + const T *x, const T *w, T *dx, T *dw, float *dw_array, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cu similarity index 81% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cu index 2b1f687bbfb..62d3759776b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cu +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void CalPReLUKernel(size_t size, size_t weight_size, size_t per_channel_size, @@ -32,5 +33,5 @@ void CalPReLU(size_t size, size_t weight_size, size_t per_channel_size, input, weight, output); } -template void CalPReLU(size_t, size_t, size_t, const float *, const float *, float *, cudaStream_t); -template void CalPReLU(size_t, size_t, size_t, const half *, const half *, half *, cudaStream_t); +template CUDA_LIB_EXPORT void CalPReLU(size_t, size_t, size_t, const float *, const float *, float *, cudaStream_t); +template CUDA_LIB_EXPORT void CalPReLU(size_t, size_t, size_t, const half *, const half *, half *, cudaStream_t); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh index eb8c45486f5..0d74034c63f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh @@ -14,12 +14,11 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalPReLU(size_t input_size, size_t weight_size, size_t per_channel_size, - const T *input, const T *weight, T *output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_H_ +CUDA_LIB_EXPORT void CalPReLU(size_t input_size, size_t weight_size, size_t per_channel_size, + const T *input, const T *weight, T *output, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PRELU_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cu similarity index 75% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cu index 77b39e037be..c8790db1dbe 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cu @@ -18,9 +18,8 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void PSROIPoolInitKernel(size_t size_init, T *input) { @@ -113,19 +112,21 @@ void PSROIPoolForwardLauncher( } } -template void PSROIPoolForwardLauncher( - 
const float* input, const float spatial_scale, const int rois_number, const int feature_height, - const int feature_width, const int feature_channels, const int pooled_height, - const int pooled_width, const float* roi_boxes, - const int group_size, const int output_channels, - float* output_data, int* mapping_channel, cudaStream_t stream); +template CUDA_LIB_EXPORT void PSROIPoolForwardLauncher(const float* input, const float spatial_scale, + const int rois_number, const int feature_height, + const int feature_width, const int feature_channels, + const int pooled_height, const int pooled_width, + const float* roi_boxes, const int group_size, + const int output_channels, float* output_data, + int* mapping_channel, cudaStream_t stream); -template void PSROIPoolForwardLauncher( - const half *input, const half spatial_scale, const int rois_number, const int feature_height, - const int feature_width, const int feature_channels, const int pooled_height, - const int pooled_width, const half *roi_boxes, - const int group_size, const int output_channels, - half *output_data, int* mapping_channel, cudaStream_t stream); +template CUDA_LIB_EXPORT void PSROIPoolForwardLauncher(const half *input, const half spatial_scale, + const int rois_number, const int feature_height, + const int feature_width, const int feature_channels, + const int pooled_height, const int pooled_width, + const half *roi_boxes, const int group_size, + const int output_channels, half *output_data, + int* mapping_channel, cudaStream_t stream); template __global__ void PSROIPoolBackward(const int nthreads, const T* input_diff, @@ -209,12 +210,18 @@ void PSROIPoolBackwardLauncher(const T* input_diff, const int* mapping_channel, } } -template void PSROIPoolBackwardLauncher(const float* input_diff, const int* mapping_channel, - const int batch_size, const int rois_number, const float spatial_scale, const int feature_channels, - const int feature_height, const int feature_width, const int pooled_width, const int 
pooled_height, - const int output_channels, float* output_diff, const float* roi_boxes, cudaStream_t stream); +template CUDA_LIB_EXPORT void PSROIPoolBackwardLauncher(const float* input_diff, const int* mapping_channel, + const int batch_size, const int rois_number, + const float spatial_scale, const int feature_channels, + const int feature_height, const int feature_width, + const int pooled_width, const int pooled_height, + const int output_channels, float* output_diff, + const float* roi_boxes, cudaStream_t stream); -template void PSROIPoolBackwardLauncher(const half* input_diff, const int* mapping_channel, const int batch_size, - const int rois_number, const half spatial_scale, const int feature_channels, const int feature_height, - const int feature_width, const int pooled_width, const int pooled_height, const int output_channels, - half* output_diff, const half* roi_boxes, cudaStream_t stream); +template CUDA_LIB_EXPORT void PSROIPoolBackwardLauncher(const half* input_diff, const int* mapping_channel, + const int batch_size, const int rois_number, + const half spatial_scale, const int feature_channels, + const int feature_height, const int feature_width, + const int pooled_width, const int pooled_height, + const int output_channels, half* output_diff, + const half* roi_boxes, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh new file mode 100644 index 00000000000..170906170b1 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh @@ -0,0 +1,37 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PSROI_POOLING_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PSROI_POOLING_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void PSROIPoolForwardLauncher(const T* input, const T spatial_scale, const int rois_number, + const int feature_height, const int feature_width, + const int feature_channels, const int pooled_height, + const int pooled_width, const T* roi_boxes, const int group_size, + const int output_channels, T* output_data, int* mapping_channel, + cudaStream_t stream); + +template +CUDA_LIB_EXPORT void PSROIPoolBackwardLauncher(const T* input_diff, const int* mapping_channel, const int batch_size, + const int rois_number, const T spatial_scale, const int feature_channels, + const int feature_height, const int feature_width, + const int pooled_width, const int pooled_height, + const int output_channels, T* output_diff, const T* roi_boxes, + cudaStream_t stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_PSROI_POOLING_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cu similarity index 61% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cu index 79dedd603e0..000de8f5efa 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cu @@ -14,7 +14,8 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh" +#include "include/cuda_fp16.h" template __global__ void RandomCategorical(const size_t num_samples, double** dev_rand, double** dev_cdf, @@ -72,19 +73,22 @@ void GetCdfKernel(const T *logits_addr, double** dev_cdf, const size_t batch_siz GetCdf<<>>(logits_addr, dev_cdf, batch_size, num_classes); } -template void GetCdfKernel(const half *logits_addr, double** dev_cdf, const size_t batch_size, - const size_t num_classes, cudaStream_t cuda_stream); -template void GetCdfKernel(const float *logits_addr, double** dev_cdf, const size_t batch_size, - const size_t num_classes, cudaStream_t cuda_stream); -template void GetCdfKernel(const double *logits_addr, double** dev_cdf, const size_t batch_size, - const size_t num_classes, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void GetCdfKernel(const half *logits_addr, double** dev_cdf, const size_t batch_size, + const size_t num_classes, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void GetCdfKernel(const float *logits_addr, double** dev_cdf, const size_t batch_size, + const size_t num_classes, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void GetCdfKernel(const double *logits_addr, double** dev_cdf, const size_t batch_size, + const size_t num_classes, cudaStream_t cuda_stream); -template void RandomCategoricalKernel(const size_t num_samples, - double** dev_rand, double** dev_cdf, const size_t batch_size, const size_t num_classes, - int16_t *output_addr, cudaStream_t cuda_stream); -template void RandomCategoricalKernel(const size_t num_samples, - double** dev_rand, double** dev_cdf, const size_t batch_size, const size_t num_classes, - int 
*output_addr, cudaStream_t cuda_stream); -template void RandomCategoricalKernel(const size_t num_samples, - double** dev_rand, double** dev_cdf, const size_t batch_size, const size_t num_classes, - int64_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void RandomCategoricalKernel(const size_t num_samples, double** dev_rand, + double** dev_cdf, const size_t batch_size, + const size_t num_classes, int16_t *output_addr, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void RandomCategoricalKernel(const size_t num_samples, double** dev_rand, + double** dev_cdf, const size_t batch_size, + const size_t num_classes, int *output_addr, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void RandomCategoricalKernel(const size_t num_samples, double** dev_rand, + double** dev_cdf, const size_t batch_size, + const size_t num_classes, int64_t *output_addr, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh new file mode 100644 index 00000000000..fcaacde2018 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh @@ -0,0 +1,28 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CATEGORICAL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CATEGORICAL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void GetCdfKernel(const T *logits_addr, double** dev_cdf, const size_t batch_size, + const size_t num_classes, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void RandomCategoricalKernel(const size_t num_samples, double** dev_rand, double** dev_cdf, + const size_t batch_size, const size_t num_classes, S *output_addr, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CATEGORICAL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cu similarity index 91% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cu index da0f65ba428..a0a9b3c2d72 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh" #include int RcwmRoundUpPower2(int v) { @@ -257,8 +257,9 @@ void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, index_buff, rank_buff, Tnum_buff); } -template void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, const int &d1, const int &d2, - const int &d3, const int &d4, const int &d5, const int &seedc, const int &count, - const bool *input, int *output_index, bool *output_mask, int *index_buff, - int *mask_buff, int *rank_buff, int *Tnum_buff, int *tmp_buff, - curandState *globalState, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, const int &d1, + const int &d2, const int &d3, const int &d4, const int &d5, + const int &seedc, const int &count, const bool *input, + int *output_index, bool *output_mask, int *index_buff, + int *mask_buff, int *rank_buff, int *Tnum_buff, int *tmp_buff, + curandState *globalState, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh new file mode 100644 index 00000000000..e64e60a78e5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh @@ -0,0 +1,38 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_ +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#define BLOCKSIZE 256 +#define MAX_DIMENSION 5 + +template +CUDA_LIB_EXPORT void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, K *input, S *output_index, + K *output_mask, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, const int &d1, + const int &d2, const int &d3, const int &d4, const int &d5, + const int &seedc, const int &count, const T *input, S *output_index, + T *output_mask, S *index_buff, S *mask_buff, S *rank_buff, + S *Tnum_buff, S *tmp_buff, curandState *globalState, cudaStream_t stream); + +CUDA_LIB_EXPORT int RcwmRoundUpPower2(int v); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cu similarity index 71% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cu index 2d1998f7dd6..e6c5a33b0e0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cu +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cu @@ -100,17 +100,17 @@ void UniformReal(int seed, int seed2, curandState *globalState, T *output, size_ return; } -template void StandardNormal(int seed, int seed2, curandState *globalState, - float *output, size_t count, cudaStream_t cuda_stream); -template void StandardNormal(int seed, int seed2, curandState *globalState, - int *output, size_t count, cudaStream_t cuda_stream); -template bool UniformInt(int seed, int seed2, curandState *globalState, float *input1, size_t input_size_1, - float *input2, size_t input_size_2, float *output, size_t count, - cudaStream_t cuda_stream); -template bool UniformInt(int seed, int seed2, curandState *globalState, int *input1, size_t input_size_1, - int *input2, size_t input_size_2, int *output, size_t count, - cudaStream_t cuda_stream); -template void UniformReal(int seed, int seed2, curandState *globalState, - float *output, size_t count, cudaStream_t cuda_stream); -template void UniformReal(int seed, int seed2, curandState *globalState, - int *output, size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void StandardNormal(int seed, int seed2, curandState *globalState, + float *output, size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void StandardNormal(int seed, int seed2, curandState *globalState, + int *output, size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT bool UniformInt(int seed, int seed2, curandState *globalState, float *input1, + size_t input_size_1, float *input2, size_t input_size_2, float *output, + size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT bool UniformInt(int seed, int seed2, curandState *globalState, int *input1, + size_t input_size_1, int *input2, size_t input_size_2, int *output, + size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UniformReal(int seed, int seed2, curandState *globalState, + float *output, size_t count, cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void UniformReal(int seed, int seed2, curandState *globalState, + int *output, size_t count, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cuh new file mode 100644 index 00000000000..00c4ba3656b --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cuh @@ -0,0 +1,33 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_OP_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_OP_IMPL_CUH_ +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void StandardNormal(int seed, int seed2, curandState *globalState, + T *output, size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT bool UniformInt(int seed, int seed2, curandState *globalState, + T *input1, size_t input_size_1, T *input2, size_t input_size_2, + T *output, size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void UniformReal(int seed, int seed2, curandState *globalState, + T *output, size_t count, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANDOM_OP_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cu similarity index 72% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cu index afc3a50a8aa..997dbaaf3c3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cu @@ -16,7 +16,6 @@ #include #include "range_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" template __global__ void Range(const int size, const float start, const float limit, const float delta, const T *input, @@ -32,8 +31,8 @@ void CalRange(const int size, const float start, const float limit, const float Range<<>>(size, start, limit, delta, input, output); return; } -template void CalRange(const int size, const float start, const float limit, const float delta, - const float *input, float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalRange(const int 
size, const float start, const float limit, const float delta, + const float *input, float *output, cudaStream_t cuda_stream); -template void CalRange(const int size, const float start, const float limit, const float delta, const int *input, - int *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalRange(const int size, const float start, const float limit, const float delta, + const int *input, int *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh index d0fdbc5948b..c1aa34f1842 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh @@ -1,24 +1,23 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMGRAD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMGRAD_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void CalBatchNormGrad(T *x, T *dy, float *scale, float *save_mean, float *save_variance, T *dx, float *bn_scale, - float *bn_bias, double epsilon, int N, int C, int H, int W, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMGRAD_H_ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANGE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANGE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalRange(const int size, const float start, const float limit, const float delta, const T *input, + T *output, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RANGE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rcwm_small_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rcwm_small_impl.cu similarity index 91% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rcwm_small_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rcwm_small_impl.cu index d992020489b..57468823855 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rcwm_small_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rcwm_small_impl.cu @@ -14,8 +14,8 @@ * limitations under the License.
*/ -#include "plugin/device/gpu/kernel/cuda_impl/topk_lib.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_lib.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh" // Kernel started from here #define L2_RCWM_HELPER(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, IS_DESCEND) \ @@ -148,5 +148,6 @@ void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, K *input RCWMScaleK(seedc, input_size, input, count, output_index, output_mask, stream); } -template void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, bool *input, - int *output_index, bool *output_mask, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, + bool *input, int *output_index, + bool *output_mask, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cu similarity index 62% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cu index 9fbc8013dbd..a8c0831acc6 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cu @@ -15,9 +15,7 @@ */ #include - #include "real_to_complex_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" template __global__ void ToComplex(const size_t size, const T *input, T *output, cudaStream_t cuda_stream) { @@ -33,8 +31,11 @@ void RealToComplex(const size_t size, const T *input, T *output, cudaStream_t cu ToComplex<<>>(size, input, output, cuda_stream); } -template void RealToComplex(const size_t size, const double *input, double *output, cudaStream_t cuda_stream); 
-template void RealToComplex(const size_t size, const float *input, float *output, cudaStream_t cuda_stream); -template void RealToComplex(const size_t size, const int *input, int *output, cudaStream_t cuda_stream); -template void RealToComplex(const size_t size, const int64_t *input, int64_t *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void RealToComplex(const size_t size, const double *input, double *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void RealToComplex(const size_t size, const float *input, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void RealToComplex(const size_t size, const int *input, int *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void RealToComplex(const size_t size, const int64_t *input, int64_t *output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cuh old mode 100644 new mode 100755 similarity index 60% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cuh index 2d0aabc5d44..7a5daa94a8b --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/range_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cuh @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANGE_IMPL_CUH_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANGE_IMPL_CUH_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REAL_TO_COMPLEX_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REAL_TO_COMPLEX_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalRange(const int size, const float start, const float limit, const float delta, const T *input, T *output, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANGE_IMPL_CUH +CUDA_LIB_EXPORT void RealToComplex(const size_t size, const T *input, T *output, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REAL_TO_COMPLEX_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cu similarity index 51% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cu index 29084bf0f84..926e2b31b22 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void CalReLUGradKernel(int size, T *dy, T *y, T *dx) { @@ -30,11 +30,11 @@ void CalReLUGrad(int size, T *dy, T *y, T *dx, cudaStream_t cuda_stream) { return; } -template void CalReLUGrad(int size, double *dy, double *y, double *dx, cudaStream_t cuda_stream); -template void CalReLUGrad(int size, float *dy, float *y, float *dx, cudaStream_t cuda_stream); -template void CalReLUGrad(int size, half *dy, half *y, half *dx, cudaStream_t cuda_stream); -template void CalReLUGrad(int size, int8_t *dy, int8_t *y, int8_t *dx, cudaStream_t cuda_stream); -template void CalReLUGrad(int size, int16_t *dy, int16_t *y, int16_t *dx, cudaStream_t cuda_stream); -template void CalReLUGrad(int size, int32_t *dy, int32_t *y, int32_t *dx, cudaStream_t cuda_stream); -template void CalReLUGrad(int size, int64_t *dy, int64_t *y, int64_t *dx, cudaStream_t cuda_stream); -template void CalReLUGrad(int size, uint8_t *dy, uint8_t *y, uint8_t *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLUGrad(int size, double *dy, double *y, double *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLUGrad(int size, float *dy, float *y, float *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLUGrad(int size, half *dy, half *y, half *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLUGrad(int size, int8_t *dy, int8_t *y, int8_t *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLUGrad(int size, int16_t *dy, int16_t *y, int16_t *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLUGrad(int size, int32_t *dy, int32_t *y, int32_t *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLUGrad(int size, int64_t *dy, int64_t *y, int64_t *dx, cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void CalReLUGrad(int size, uint8_t *dy, uint8_t *y, uint8_t *dx, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh similarity index 59% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh index 9a392aef20d..b465b918301 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh @@ -14,10 +14,9 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalReLUGrad(int input_size, T *dy, T *y, T *dx, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_ +CUDA_LIB_EXPORT void CalReLUGrad(int input_size, T *dy, T *y, T *dx, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cu new file mode 100644 index 00000000000..a4cef64d7f5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cu @@ -0,0 +1,106 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" + +template +__global__ void CalReLUKernel(int size, T *input_addr, T *output_addr) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + output_addr[pos] = input_addr[pos] > static_cast(0) ? input_addr[pos] : static_cast(0); + } +} + +template +void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { + CalReLUKernel<<>>(size, input_addr, output_addr); +} + +template CUDA_LIB_EXPORT void CalReLU(int size, double *input_addr, double *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLU(int size, float *input_addr, float *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLU(int size, half *input_addr, half *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLU(int size, int8_t *input_addr, int8_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLU(int size, int16_t *input_addr, int16_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLU(int size, int32_t *input_addr, int32_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLU(int size, int64_t *input_addr, int64_t *output_addr, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReLU(int size, uint8_t *input_addr, uint8_t *output_addr, cudaStream_t cuda_stream); + +template +__global__ void 
ReluV2Kernel(const size_t num, const T *x, T *y, uint32_t *mask) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) { + T v = x[i]; + bool p = v > static_cast(0); + y[i] = p ? v : static_cast(0); + + auto warp_predict = BallotSync(p, __activemask()); + if (LaneId() == 0) { + mask[WarpId(i)] = warp_predict; + } + } +} + +template +void ReluV2(const size_t num, const T *x, T *y, uint32_t *mask, cudaStream_t cuda_stream) { + ReluV2Kernel<<>>(num, x, y, mask); +} + +template +__global__ void ReluGradV2Kernel(const size_t num, const T *dy, const uint32_t *mask, T *dx) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) { + bool p = mask[WarpId(i)] & (1 << LaneId()); + dx[i] = p ? dy[i] : static_cast(0); + } +} + +template +void ReluGradV2(const size_t num, const T *dy, const uint32_t *mask, T *dx, cudaStream_t cuda_stream) { + ReluGradV2Kernel<<>>(num, dy, mask, dx); +} + +template CUDA_LIB_EXPORT void ReluV2(const size_t num, const double *x, double *y, uint32_t *mask, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluV2(const size_t num, const float *x, float *y, uint32_t *mask, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluV2(const size_t num, const half *x, half *y, uint32_t *mask, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluV2(const size_t num, const int8_t *x, int8_t *y, uint32_t *mask, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluV2(const size_t num, const int16_t *x, int16_t *y, uint32_t *mask, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluV2(const size_t num, const int32_t *x, int32_t *y, uint32_t *mask, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluV2(const size_t num, const int64_t *x, int64_t *y, uint32_t *mask, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluV2(const size_t num, const uint8_t *x, uint8_t *y, uint32_t *mask, + cudaStream_t 
cuda_stream); + +template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const double *dy, const uint32_t *mask, double *dx, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const float *dy, const uint32_t *mask, float *dx, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const half *dy, const uint32_t *mask, half *dx, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const int8_t *dy, const uint32_t *mask, int8_t *dx, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const int16_t *dy, const uint32_t *mask, int16_t *dx, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const int32_t *dy, const uint32_t *mask, int32_t *dx, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const int64_t *dy, const uint32_t *mask, int64_t *dx, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const uint8_t *dy, const uint32_t *mask, uint8_t *dx, + cudaStream_t cuda_stream); + diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh similarity index 51% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh index 134aed477d7..9b68cd2857b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh @@ -14,15 +14,14 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalReLU(int input_size, T *input_addr, T *output_addr, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalReLU(int input_size, T *input_addr, T *output_addr, cudaStream_t cuda_stream); template -void ReluV2(const size_t num, const T *x, T *y, uint32_t *mask, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void ReluV2(const size_t num, const T *x, T *y, uint32_t *mask, cudaStream_t cuda_stream); template -void ReluGradV2(const size_t num, const T *dy, const uint32_t *mask, T *dx, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_H_ +CUDA_LIB_EXPORT void ReluGradV2(const size_t num, const T *dy, const uint32_t *mask, T *dx, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RELU_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cu similarity index 88% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cu index 3c508bee832..131d6d15e8b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cu @@ -14,10 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "include/cuda_fp16.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void ResizeBilinear(const T *input, const int n, const int c, const int input_h, const int input_w, const int output_h, const int output_w, const int nchw, const int chw, const int hw, const float h_scale, @@ -157,9 +155,11 @@ void CalResizeBilinearGrad(const float *input, const int n, const int c, const i return; } -template void CalResizeBilinear(const float *input, const int n, const int c, const int input_h, - const int input_w, const int output_h, const int output_w, const float h_scale, const float w_scale, float *output, - cudaStream_t cuda_stream); -template void CalResizeBilinear(const half *input, const int n, const int c, const int input_h, - const int input_w, const int output_h, const int output_w, const float h_scale, const float w_scale, half *output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalResizeBilinear(const float *input, const int n, const int c, const int input_h, + const int input_w, const int output_h, const int output_w, + const float h_scale, const float w_scale, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalResizeBilinear(const half *input, const int n, const int c, const int input_h, + const int input_w, const int output_h, const int output_w, + const float h_scale, const float w_scale, half *output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh new file mode 100644 index 00000000000..710b5d03886 --- /dev/null +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh @@ -0,0 +1,33 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_BILINEAR_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_BILINEAR_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "include/cuda_fp16.h" +template +CUDA_LIB_EXPORT void CalResizeBilinear(const T *input, const int n_, const int c_, const int input_h_, + const int input_w_, const int output_h_, const int output_w_, + const float h_scale, const float w_scale, T *output, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalResizeBilinearGrad(const half *input, const int n_, const int c_, const int input_h_, + const int input_w_, const int output_h_, const int output_w_, + const float h_scale, const float w_scale, half *output, float *interim, + cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalResizeBilinearGrad(const float *input, const int n_, const int c_, const int input_h_, + const int input_w_, const int output_h_, const int output_w_, + const float h_scale, const float w_scale, float *output, float *interim, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_BILINEAR_IMPL_CUH_ diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cu similarity index 64% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cu index e2b8209d1cf..0ea9ae0dc38 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cu @@ -18,8 +18,8 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh" template __global__ void InitZero(T *output, const int output_size) { @@ -76,15 +76,19 @@ void CalResizeNearestNeighborGrad(const int input_size, const T *input, const in return; } -template void CalResizeNearestNeighborGrad(const int input_size, const float *input, const int s1, const int s2, - const int s3, const int s4, float *output, const int d1, const int d2, - const int d3, const int d4, bool align_corners, float h_scale, - float w_scale, cudaStream_t cuda_stream); -template void CalResizeNearestNeighborGrad(const int input_size, const half *input, const int s1, const int s2, - const int s3, const int s4, half *output, const int d1, const int d2, - const int d3, const int d4, bool align_corners, float h_scale, - float w_scale, cudaStream_t cuda_stream); -template void CalResizeNearestNeighborGrad(const int input_size, const int *input, const int s1, const int s2, - const int s3, const int s4, int *output, const int d1, const int d2, - const int d3, const int d4, bool 
align_corners, float h_scale, - float w_scale, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalResizeNearestNeighborGrad(const int input_size, const float *input, + const int s1, const int s2, const int s3, + const int s4, float *output, const int d1, + const int d2, const int d3, const int d4, + bool align_corners, float h_scale, float w_scale, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalResizeNearestNeighborGrad(const int input_size, const half *input, const int s1, + const int s2, const int s3, const int s4, half *output, + const int d1, const int d2, const int d3, const int d4, + bool align_corners, float h_scale, float w_scale, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalResizeNearestNeighborGrad(const int input_size, const int *input, const int s1, + const int s2, const int s3, const int s4, int *output, + const int d1, const int d2, const int d3, const int d4, + bool align_corners, float h_scale, float w_scale, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh new file mode 100644 index 00000000000..ec156628e7c --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#define RESIZENEARESTNEIGHBORGRAD_DIMENSION 4 + +template +CUDA_LIB_EXPORT void CalResizeNearestNeighborGrad(const int input_size, const T *input, const int s1, const int s2, + const int s3, const int s4, T *output, const int d1, const int d2, + const int d3, const int d4, bool align_corners, float h_scale, + float w_scale, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cu similarity index 63% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cu index 3186b1c4566..ac80d937697 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cu @@ -18,7 +18,8 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void ResizeNearestNeighbor(const int size, const T *input, const int s1, const int s2, const int s3, @@ -65,15 +66,18 @@ void CalResizeNearestNeighbor(const int size, const T *input, const int s1, cons return; } -template 
void CalResizeNearestNeighbor(const int size, const float *input, const int s1, const int s2, - const int s3, const int s4, float *output, const int d1, const int d2, - const int d3, const int d4, bool align_corners, float h_scale, - float w_scale, cudaStream_t cuda_stream); -template void CalResizeNearestNeighbor(const int size, const half *input, const int s1, const int s2, - const int s3, const int s4, half *output, const int d1, const int d2, - const int d3, const int d4, bool align_corners, float h_scale, - float w_scale, cudaStream_t cuda_stream); -template void CalResizeNearestNeighbor(const int size, const int *input, const int s1, const int s2, const int s3, - const int s4, int *output, const int d1, const int d2, const int d3, - const int d4, bool align_corners, float h_scale, float w_scale, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalResizeNearestNeighbor(const int size, const float *input, const int s1, + const int s2, const int s3, const int s4, float *output, + const int d1, const int d2, const int d3, const int d4, + bool align_corners, float h_scale, float w_scale, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalResizeNearestNeighbor(const int size, const half *input, const int s1, + const int s2, const int s3, const int s4, half *output, + const int d1, const int d2, const int d3, const int d4, + bool align_corners, float h_scale, float w_scale, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalResizeNearestNeighbor(const int size, const int *input, const int s1, + const int s2, const int s3, const int s4, int *output, + const int d1, const int d2, const int d3, const int d4, + bool align_corners, float h_scale, float w_scale, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh new file mode 100644 index 
00000000000..b2651e1252d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#define RESIZENEARESTNEIGHBOR_DIMENSION 4 + +template +CUDA_LIB_EXPORT void CalResizeNearestNeighbor(const int size, const T *input, const int s1, const int s2, const int s3, + const int s4, T *output, const int d1, const int d2, const int d3, + const int d4, bool align_corners, float h_scale, float w_scale, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cu new file mode 100644 index 00000000000..4e3b69d95ec --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cu @@ -0,0 +1,189 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); 
+ * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh" +#include "include/cuda_fp16.h" + +// Util function to convert a 1D input array index to an N-D positional index +// Required since GPU iterates over all values in an ND array as a 1D array +__inline__ __device__ void IdxToPos(size_t idx, size_t *pos, size_t cur_thread_idx, size_t *cum_shape, + size_t shape_size) { + size_t rem_val = idx; + for (int i = 0; i < shape_size; i++) { + pos[cur_thread_idx + i] = rem_val / cum_shape[i]; + rem_val = rem_val % cum_shape[i]; + } + return; +} + +// Util function to convert a N-D positonal index to a 1D index +__inline__ __device__ size_t PosToIdx(size_t *pos, size_t cur_thread_idx, size_t *cum_shape, size_t shape_size) { + size_t idx = 0; + for (int i = 0; i < shape_size; i++) { + idx = idx + (pos[cur_thread_idx + i] * cum_shape[i]); + } + return idx; +} + +// CumShape takes Shape: (2,2,5) => cumShape (10,5,1) which informs how many values +// each dimension will represent. Required for converting 1d index to positional vector. 
+// In this example 10 in dim 0 means, an increase of 1 in this dim leads to another 10 values +// in the overall array +__global__ void ComputeCumShape(const size_t *input_shape_ptr, size_t *input_shape_cum_ptr, size_t shape_size) { + int cur_val = 1; + for (int i = shape_size - 1; i >= 0; i--) { + // iterate list in reverse and cummulatively build shape + input_shape_cum_ptr[i] = cur_val; + cur_val = cur_val * input_shape_ptr[i]; + } + return; +} +template +__global__ void ReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr, + size_t *input_shape_cum_ptr, size_t shape_size, T *output) { + // calculate which thread this is out of total across all blocks for accessing respective cur_pos_arr memory + size_t cur_thread_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + cur_thread_idx = cur_thread_idx * shape_size; + size_t cur_slice = 0; // current slice as split by the batch_dim + size_t cur_slice_seq_len = 0; // reverse seq length for this slice as provided by user + size_t new_idx = 0; // calculate corresponding reverse element from input + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) { + IdxToPos(idx, cur_pos_arr, cur_thread_idx, input_shape_cum_ptr, shape_size); + cur_slice = cur_pos_arr[cur_thread_idx + batch_dim]; // all accesses to cur_pos_arr have to be adjusted per thread + cur_slice_seq_len = seq_len[cur_slice]; + if (cur_slice_seq_len == 0) { // adjust length to 1 if 0 provided, same result in both cases + cur_slice_seq_len = 1; + } + if (cur_pos_arr[cur_thread_idx + seq_dim] > (cur_slice_seq_len - 1)) { // check if within range + // copy value directly and continue - outside of reversal range + output[idx] = input[idx]; + continue; + } + // find corresponding reverse element in input + cur_pos_arr[cur_thread_idx + seq_dim] = + (cur_slice_seq_len - 1) - cur_pos_arr[cur_thread_idx + 
seq_dim]; // adjust position to target + new_idx = PosToIdx(cur_pos_arr, cur_thread_idx, input_shape_cum_ptr, shape_size); // get the updated index + output[idx] = input[new_idx]; + } + return; +} + +template +CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr, + size_t *input_shape_cum_ptr, size_t shape_size, T *output, + cudaStream_t cuda_stream) { + ComputeCumShape<<<1, 1, 0, cuda_stream>>>(input_shape_ptr, input_shape_cum_ptr, shape_size); + ReverseSequence<<>>( + size, input, seq_len, batch_dim, seq_dim, cur_pos_arr, input_shape_ptr, input_shape_cum_ptr, shape_size, output); + return; +} + +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const int8_t *input, + const int *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + int8_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const int8_t *input, + const int64_t *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + int8_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const int16_t *input, + const int *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + int16_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const int16_t *input, + const int64_t *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + int16_t *output, cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const int *input, const int *seq_len, + const int64_t batch_dim, const int64_t seq_dim, + size_t *cur_pos_arr, const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, int *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const int *input, + const int64_t *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + int *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const int64_t *input, + const int *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + int64_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const int64_t *input, + const int64_t *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + int64_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const half *input, const int *seq_len, + const int64_t batch_dim, const int64_t seq_dim, + size_t *cur_pos_arr, const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + half *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const half *input, + const int64_t *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + half *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const float *input, const int *seq_len, + const 
int64_t batch_dim, const int64_t seq_dim, + size_t *cur_pos_arr, const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const float *input, + const int64_t *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const double *input, + const int *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + double *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const double *input, + const int64_t *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + double *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const bool *input, const int *seq_len, + const int64_t batch_dim, const int64_t seq_dim, + size_t *cur_pos_arr, const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + bool *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const bool *input, + const int64_t *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, + const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, + bool *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh new file mode 100644 index 00000000000..5a0f57451bc --- 
/dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_SEQUENCE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_SEQUENCE_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim, + const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr, + size_t *intput_shape_cum_ptr, size_t shape_size, T *output, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_SEQUENCE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cu new file mode 100644 index 00000000000..1b48174e5f4 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cu @@ -0,0 +1,65 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "reverse_v2_impl.cuh" +#include "include/cuda_fp16.h" +template +__global__ void ReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides, + const int64_t* axis, size_t input_size, size_t axis_size) { + for (int64_t gt_id = blockIdx.x * blockDim.x + threadIdx.x; gt_id < input_size; gt_id += blockDim.x * gridDim.x) { + int64_t intermediate_index = gt_id; + for (size_t i = 0; i < axis_size; i++) { + int64_t d = axis[i]; + int64_t pre_reverse_position = (gt_id / strides[d]) % input_shape[d]; + int64_t reversed_position = input_shape[d] - pre_reverse_position - 1; + intermediate_index += ((reversed_position - pre_reverse_position) * strides[d]); + } + + output[intermediate_index] = input[gt_id]; + } + return; +} +template +void CalReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides, const int64_t* axis, + size_t input_size, size_t axis_size, cudaStream_t cuda_stream) { + ReverseV2<<>>(input, output, input_shape, strides, axis, + input_size, axis_size); + return; +} + +template CUDA_LIB_EXPORT void CalReverseV2(const half* input, half* output, const size_t* input_shape, + const int64_t* strides, const int64_t* axis, size_t input_size, + size_t axis_size, cudaStream_t cuda_stream); + +template CUDA_LIB_EXPORT void CalReverseV2(const float* input, float* output, const size_t* input_shape, + const int64_t* strides, const int64_t* axis, size_t input_size, + size_t axis_size, cudaStream_t cuda_stream); + +template CUDA_LIB_EXPORT void CalReverseV2(const uint8_t* input, 
uint8_t* output, const size_t* input_shape, + const int64_t* strides, const int64_t* axis, size_t input_size, + size_t axis_size, cudaStream_t cuda_stream); + +template CUDA_LIB_EXPORT void CalReverseV2(const int16_t* input, int16_t* output, const size_t* input_shape, + const int64_t* strides, const int64_t* axis, size_t input_size, + size_t axis_size, cudaStream_t cuda_stream); + +template CUDA_LIB_EXPORT void CalReverseV2(const int32_t* input, int32_t* output, const size_t* input_shape, + const int64_t* strides, const int64_t* axis, size_t input_size, + size_t axis_size, cudaStream_t cuda_stream); + +template CUDA_LIB_EXPORT void CalReverseV2(const int64_t* input, int64_t* output, const size_t* input_shape, + const int64_t* strides, const int64_t* axis, size_t input_size, + size_t axis_size, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh index e1aa136e5fb..7edfa6c7411 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh @@ -13,9 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_V2_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_V2_CUH_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_V2_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_V2_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides, const int64_t* axis, - size_t input_size, size_t axis_size, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_V2_CUH_ +CUDA_LIB_EXPORT void CalReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides, + const int64_t* axis, size_t input_size, size_t axis_size, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_REVERSE_V2_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cu similarity index 92% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cu index 4a2b68e70bc..ee6f93f0f73 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cu @@ -15,8 +15,7 @@ */ #include -#include "plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh" template __global__ void RmsPropKernel(const T* learning_rate, const T decay, const T momentum, const T epsilon, T* variable, @@ -57,12 +56,12 @@ void RmsPropCenter(const T* learning_rate, const T* decay, const T* momentum, co moment, gradients, size); } 
-template +template CUDA_LIB_EXPORT void RmsProp(const float* learning_rate, const float decay, const float momentum, const float epsilon, - float* variable, float* mean_square, float* moment, float* gradients, const size_t size, - cudaStream_t cuda_stream); + float* variable, float* mean_square, float* moment, float* gradients, const size_t size, + cudaStream_t cuda_stream); -template +template CUDA_LIB_EXPORT void RmsPropCenter(const float* learning_rate, const float* decay, const float* momentum, const float* epsilon, float* variable, float* mean_gradients, float* mean_square, float*moment, float* gradients, const size_t size, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh new file mode 100644 index 00000000000..e87991023cc --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh @@ -0,0 +1,30 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RMSPROP_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RMSPROP_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void RmsProp(const T* learning_rate, const T decay, const T momentum, const T epsilon, T* variable, + T* mean_square, T* moment, T* gradients, const size_t size, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void RmsPropCenter(const T* learning_rate, const T* decay, const T* momentum, const T* epsilon, + T* variable, T* mean_gradients, T* mean_square, T* moment, T* gradients, + const size_t size, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_RMSPROP_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cu similarity index 85% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cu index 11b783976ac..9c718b99b2a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cu @@ -16,7 +16,6 @@ #include "roi_align_impl.cuh" #include "util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" inline __device__ int roi_cast_int(float x) { return __float2int_rd(x); } inline __device__ int roi_cast_int(half x) { return __half2int_rd(x); } @@ -182,15 +181,16 @@ void ROIAlign(const T *x, const T *roi_boxes, int roi_rows, int roi_cols, T *out height, width, pooled_height, pooled_width); } -template void ROIAlign(const float *x, const float *roi_boxes, int roi_rows, int roi_cols, float *out_data, - const float spatial_scale, const int sample_num, int roi_end_mode, const int channels, - const int height, const 
int width, const int pooled_height, const int pooled_width, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ROIAlign(const float *x, const float *roi_boxes, int roi_rows, int roi_cols, + float *out_data, const float spatial_scale, const int sample_num, + int roi_end_mode, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + cudaStream_t cuda_stream); -template void ROIAlign(const half *x, const half *roi_boxes, int roi_rows, int roi_cols, half *out_data, - const half spatial_scale, const int sample_num, int roi_end_mode, const int channels, - const int height, const int width, const int pooled_height, const int pooled_width, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ROIAlign(const half *x, const half *roi_boxes, int roi_rows, int roi_cols, + half *out_data, const half spatial_scale, const int sample_num, + int roi_end_mode, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, cudaStream_t cuda_stream); template __global__ void ROIAlignGradInitKernel(size_t size_init, T *dx) { @@ -275,12 +275,14 @@ void ROIAlignGrad(const T *dy, const T *roi_boxes, int batch_size, int roi_rows, pooled_width); } -template void ROIAlignGrad(const float *dy, const float *roi_boxes, int batch_size, int roi_rows, int roi_cols, - float *dx, const float spatial_scale, const int sample_num, int roi_end_mode, - const int channels, const int height, const int width, const int pooled_height, - const int pooled_width, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ROIAlignGrad(const float *dy, const float *roi_boxes, int batch_size, int roi_rows, + int roi_cols, float *dx, const float spatial_scale, + const int sample_num, int roi_end_mode, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, cudaStream_t cuda_stream); -template void ROIAlignGrad(const half *dy, const half *roi_boxes, int 
batch_size, int roi_rows, int roi_cols, - half *dx, const half spatial_scale, const int sample_num, int roi_end_mode, - const int channels, const int height, const int width, const int pooled_height, - const int pooled_width, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ROIAlignGrad(const half *dy, const half *roi_boxes, int batch_size, int roi_rows, + int roi_cols, half *dx, const half spatial_scale, const int sample_num, + int roi_end_mode, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh new file mode 100644 index 00000000000..bd091ab4c02 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh @@ -0,0 +1,32 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ROI_ALIGN_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ROI_ALIGN_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void ROIAlign(const T *x, const T *roi_boxes, int roi_rows, int roi_cols, T *out_data, + const T spatial_scale, const int sample_num, int roi_end_mode, const int channels, + const int height, const int width, const int pooled_height, const int pooled_width, + cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void ROIAlignGrad(const T *dy, const T *roi_boxes, int batch_size, int roi_rows, int roi_cols, T *dx, + const T spatial_scale, const int sample_num, int roi_end_mode, const int channels, + const int height, const int width, const int pooled_height, const int pooled_width, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ROI_ALIGN_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cu new file mode 100644 index 00000000000..5de93728879 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cu @@ -0,0 +1,104 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh" + +template +__global__ void ScatterUpdateKernel(const size_t inner_size, const size_t updates_size, const S *indices, + const T *updates, T *input) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) { + const size_t index = pos / inner_size; + const size_t offset = pos % inner_size; + const size_t current_pos = indices[index] * inner_size + offset; + input[current_pos] = updates[pos]; + } +} + +template +__global__ void ScatterAddKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates, + T *input) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) { + const size_t index = pos / inner_size; + const size_t offset = pos % inner_size; + const size_t current_pos = indices[index] * inner_size + offset; + MsAtomicAdd(&input[current_pos], updates[pos]); + } +} + +template +__global__ void ScatterSubKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates, + T *input) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) { + const size_t index = pos / inner_size; + const size_t offset = pos % inner_size; + const size_t current_pos = indices[index] * inner_size + offset; + MsAtomicAdd(&input[current_pos], -updates[pos]); + } +} + +template +void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, const size_t &indices_size, + const S *indices, const T *updates, T *input, cudaStream_t cuda_stream) { + const size_t updates_size = inner_size * indices_size; + switch (func_type) { + case SCATTER_FUNC_UPDATE: + return ScatterUpdateKernel<<>>(inner_size, updates_size, + indices, updates, input); + case SCATTER_FUNC_ADD: + return 
ScatterAddKernel<<>>(inner_size, updates_size, + indices, updates, input); + case SCATTER_FUNC_SUB: + return ScatterSubKernel<<>>(inner_size, updates_size, + indices, updates, input); + default: + break; + } +} + +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, + const size_t &indices_size, const int *indices, + const float *updates, float *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, + const size_t &indices_size, const int64_t *indices, + const float *updates, float *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, + const size_t &indices_size, const int *indices, + const half *updates, half *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, + const size_t &indices_size, const int64_t *indices, + const half *updates, half *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, + const size_t &indices_size, const int *indices, const int *updates, + int *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, + const size_t &indices_size, const int64_t *indices, + const int *updates, int *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, + const size_t &inner_size, const size_t &indices_size, + const int *indices, const unsigned char *updates, + unsigned char *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, + const size_t &inner_size, const size_t &indices_size, + const int64_t *indices, const unsigned char *updates, + unsigned char *input, cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, + const size_t &indices_size, const int *indices, + const int8_t *updates, int8_t *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, + const size_t &indices_size, const int64_t *indices, + const int8_t *updates, int8_t *input, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh similarity index 55% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh index 8c264d0fbc8..4b6fb1d6cde 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh @@ -14,10 +14,9 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_FUNCTOR_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_FUNCTOR_IMPL_CUH_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_FUNCTOR_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_FUNCTOR_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" enum ScatterFunctorType { SCATTER_FUNC_UPDATE = 0, @@ -27,7 +26,8 @@ enum ScatterFunctorType { }; template -void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, const size_t &indices_size, - const S *indices, const T *updates, T *input, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, + const size_t &indices_size, const S *indices, const T *updates, T *input, + cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_FUNCTOR_IMPL_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_FUNCTOR_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cu new file mode 100644 index 00000000000..73da9c2a0f1 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cu @@ -0,0 +1,118 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" + +template +__global__ void ScatterNdKernel(S *indices, T *update, T *output, const size_t block_size, const size_t input_size, + const size_t output_size, const size_t indices_dim_0, const size_t indices_dim_1, + S *indices_stride, S *work_shape) { + int i, j; + for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; + read_index += blockDim.x * gridDim.x) { + size_t write_index = 0; + bool out_bound = false; + + i = read_index / block_size; + j = read_index % block_size; + + for (size_t k = 0; k < indices_dim_1; k++) { + S indices_i = indices[i * indices_dim_1 + k]; + out_bound |= indices_i >= work_shape[k]; + write_index += indices_i * indices_stride[k]; + } + + write_index += j; + out_bound |= write_index >= output_size; + + if (!out_bound) { + MsAtomicAdd(&output[write_index], update[read_index]); + } + } +} + +template +void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride, + S *work_shape, cudaStream_t stream) { + ScatterNdKernel<<>>(indices, update, output, block_size, input_size, + output_size, indices_dim_0, indices_dim_1, + indices_stride, work_shape); + return; +} + +template CUDA_LIB_EXPORT void ScatterNd(int *indices, double *update, double *output, + const size_t &block_size, const size_t &input_size, + const 
size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, int *work_shape, + cudaStream_t stream); +template CUDA_LIB_EXPORT void ScatterNd(int64_t *indices, double *update, double *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void ScatterNd(int *indices, float *update, float *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, int *work_shape, + cudaStream_t stream); +template CUDA_LIB_EXPORT void ScatterNd(int64_t *indices, float *update, float *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void ScatterNd(int *indices, half *update, half *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, const size_t &indices_dim_1, + int *indices_stride, int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void ScatterNd(int64_t *indices, half *update, half *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void ScatterNd(int *indices, int *update, int *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, const size_t &indices_dim_1, + int *indices_stride, int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void ScatterNd(int64_t *indices, int *update, int *output, + 
const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); +// NOLINTNEXTLINE +template CUDA_LIB_EXPORT void ScatterNd(int *indices, short *update, short *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, int *work_shape, + cudaStream_t stream); +// NOLINTNEXTLINE +template CUDA_LIB_EXPORT void ScatterNd(int64_t *indices, short *update, short *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void ScatterNd(int *indices, unsigned char *update, unsigned char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void ScatterNd(int64_t *indices, unsigned char *update, + unsigned char *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh new file mode 100644 index 00000000000..5c159e8aecb --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh @@ -0,0 +1,25 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, + S *indices_stride, S *work_shape, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cu new file mode 100644 index 00000000000..4363c04d5ed --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cu @@ -0,0 +1,209 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh" + +template +__global__ void ScatterNdUpdate(const size_t unit_size, const size_t index_depth, const size_t updates_size, + const S *out_strides, const S *indices, const T *updates, T *input) { + int i, j; + for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size); + read_index += blockDim.x * gridDim.x) { + size_t write_index = 0; + bool out_bound = false; + + i = read_index / unit_size; + j = read_index % unit_size; + + for (size_t k = 0; k < index_depth; k++) { + S indices_i = indices[i * index_depth + k]; + out_bound |= indices_i < 0; + write_index += indices_i * out_strides[k] * unit_size; + } + + write_index += j; + + if (!out_bound) { + input[write_index] = updates[read_index]; + } + } +} + +template +__global__ void ScatterNdAdd(const size_t unit_size, const size_t index_depth, const size_t updates_size, + const S *out_strides, const S *indices, const T *updates, T *input) { + int i, j; + for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size); + read_index += blockDim.x * gridDim.x) { + size_t write_index = 0; + bool out_bound = false; + + i = read_index / unit_size; + j = read_index % unit_size; + + for (size_t k = 0; k < index_depth; k++) { + S indices_i = indices[i * index_depth + k]; + out_bound |= indices_i < 0; + write_index += indices_i * out_strides[k] * unit_size; + } + + write_index += j; + + if (!out_bound) { + MsAtomicAdd(&input[write_index], updates[read_index]); + } + } +} + +template +__global__ void ScatterNdSub(const size_t unit_size, const size_t index_depth, const size_t updates_size, + const S *out_strides, const S *indices, const T *updates, T *input) { + int i, j; + for (size_t read_index = blockIdx.x * 
blockDim.x + threadIdx.x; read_index < (updates_size); + read_index += blockDim.x * gridDim.x) { + size_t write_index = 0; + bool out_bound = false; + + i = read_index / unit_size; + j = read_index % unit_size; + + for (size_t k = 0; k < index_depth; k++) { + S indices_i = indices[i * index_depth + k]; + out_bound |= indices_i < 0; + write_index += indices_i * out_strides[k] * unit_size; + } + + write_index += j; + + if (!out_bound) { + MsAtomicAdd(&input[write_index], -updates[read_index]); + } + } +} + +template +void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, const S *out_strides, const S *indices, const T *updates, T *input, + cudaStream_t cuda_stream) { + const size_t updates_size = unit_size * num_units; + switch (func_type) { + case SCATTER_ND_FUNC_UPDATE: + return ScatterNdUpdate<<>>( + unit_size, index_depth, updates_size, out_strides, indices, updates, input); + case SCATTER_ND_FUNC_ADD: + return ScatterNdAdd<<>>( + unit_size, index_depth, updates_size, out_strides, indices, updates, input); + case SCATTER_ND_FUNC_SUB: + return ScatterNdSub<<>>( + unit_size, index_depth, updates_size, out_strides, indices, updates, input); + default: + break; + } +} + +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int64_t *out_strides, const int64_t *indices, + const double *updates, double *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int32_t *out_strides, const int32_t *indices, + const double *updates, double *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + 
const size_t &index_depth, + const int64_t *out_strides, const int64_t *indices, + const float *updates, float *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int32_t *out_strides, const int32_t *indices, + const float *updates, float *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, const int64_t *out_strides, + const int64_t *indices, const half *updates, + half *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, const int32_t *out_strides, + const int32_t *indices, const half *updates, + half *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int64_t *out_strides, const int64_t *indices, + const int32_t *updates, int32_t *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int32_t *out_strides, const int32_t *indices, + const int32_t *updates, int32_t *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int64_t *out_strides, const int64_t *indices, + const int16_t *updates, int16_t *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t 
&unit_size, const size_t &num_units, + const size_t &index_depth, + const int32_t *out_strides, const int32_t *indices, + const int16_t *updates, int16_t *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int64_t *out_strides, const int64_t *indices, + const uint8_t *updates, uint8_t *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int32_t *out_strides, const int32_t *indices, + const uint8_t *updates, uint8_t *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int64_t *out_strides, const int64_t *indices, + const int8_t *updates, int8_t *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, + const int32_t *out_strides, const int32_t *indices, + const int8_t *updates, int8_t *input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, const int64_t *out_strides, + const int64_t *indices, const bool *updates, + bool *input, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, + const size_t &unit_size, const size_t &num_units, + const size_t &index_depth, const int32_t *out_strides, + const int32_t *indices, const bool *updates, + bool *input, cudaStream_t cuda_stream); diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh similarity index 53% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh index faf08587d49..c2c380597d9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh @@ -14,10 +14,9 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_FUNCTOR_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_FUNCTOR_IMPL_CUH_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_FUNCTOR_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_FUNCTOR_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" enum ScatterNdFunctorType { SCATTER_ND_FUNC_UPDATE = 0, @@ -27,8 +26,8 @@ enum ScatterNdFunctorType { }; template -void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, const size_t &num_units, - const size_t &index_depth, const S *out_strides, const S *indices, const T *updates, T *input, - cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, + const size_t &num_units, const size_t &index_depth, const S *out_strides, + const S *indices, const T *updates, T *input, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_FUNCTOR_IMPL_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SCATTER_ND_FUNCTOR_IMPL_CUH_ diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cu new file mode 100644 index 00000000000..a7619b7fab0 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cu @@ -0,0 +1,49 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh" +#include "include/cuda_fp16.h" + +template +__global__ void Select(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { + output[pos] = cond[pos] ? 
input_x[pos] : input_y[pos]; + } + return; +} + +template +void CalSelect(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output, + cudaStream_t cuda_stream) { + Select<<>>(size, cond, input_x, input_y, output); + return; +} + +template CUDA_LIB_EXPORT void CalSelect(const size_t size, const bool* cond, const double* input_X, + const double* input_y, double* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSelect(const size_t size, const bool* cond, const float* input_X, + const float* input_y, float* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSelect(const size_t size, const bool* cond, const int* input_X, + const int* input_y, int* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSelect(const size_t size, const bool* cond, const half* input_X, + const half* input_y, half* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSelect(const size_t size, const bool* cond, const int64_t* input_X, + const int64_t* input_y, int64_t* output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSelect(const size_t size, const bool *cond, const bool *input_X, + const bool *input_y, bool *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh similarity index 56% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh index 3ee876061b3..f465fa0f4ea 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh @@ -14,12 +14,11 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SELECT_IMPL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SELECT_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SELECT_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SELECT_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CalSelect(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SELECT_IMPL_H_ +CUDA_LIB_EXPORT void CalSelect(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SELECT_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cu similarity index 81% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cu index 6677c2676d2..f5540d04755 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cu @@ -15,7 +15,7 @@ */ #include -#include "plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh" template __global__ void SGDKernel(const int size, const T dampening, const T weight_decay, const bool nesterov, const T *grad, @@ -52,6 +52,6 @@ void SGD(const int size, const T dampening, const T weight_decay, const bool nes lr, param, accum, stat); } -template void SGD(const int size, const float dampening, const float weight_decay, const bool nesterov, const float *lr, - const float *momentum, const float 
*grad, float *param, float *accum, float *stat, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SGD(const int size, const float dampening, const float weight_decay, const bool nesterov, + const float *lr, const float *momentum, const float *grad, float *param, float *accum, + float *stat, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh index 487f88c128c..cc091404674 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh @@ -14,12 +14,11 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SGD_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SGD_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SGD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SGD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void SGD(const int size, const T dampening, const T weight_decay, const bool nesterov, const T *lr, const T *momentum, - const T *grad, T *param, T *accum, T *stat, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SGD_IMPL_H_ +CUDA_LIB_EXPORT void SGD(const int size, const T dampening, const T weight_decay, const bool nesterov, const T *lr, + const T *momentum, const T *grad, T *param, T *accum, T *stat, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SGD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cu 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cu similarity index 57% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cu index 1119bb138b3..39623c84f76 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void SigmoidCrossEntropyWithLogitsGradKernel(const size_t size, const T *logits, const S *labels, @@ -49,14 +50,23 @@ void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const T *logits, const dout_addr, outputs); } -template void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const half *logits, - const half *labels, const half *dout_addr, - half *outputs, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogitsGrad(const size_t size, + const half *logits, + const half *labels, + const half *dout_addr, + half *outputs, + cudaStream_t cuda_stream); -template void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const float *logits, - const float *labels, const float *dout_addr, - float *outputs, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogitsGrad(const size_t size, + const float *logits, + const float *labels, + const float *dout_addr, + float *outputs, + cudaStream_t cuda_stream); -template void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const double *logits, - const 
double *labels, const double *dout_addr, - double *outputs, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogitsGrad(const size_t size, + const double *logits, + const double *labels, + const double *dout_addr, + double *outputs, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh new file mode 100644 index 00000000000..cdcff8a755a --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh @@ -0,0 +1,24 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const T *logits, const S *labels, + const T *dout_addr, T *outputs, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cu similarity index 64% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cu index 73b16c6a123..ec99b655623 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void SigmoidCrossEntropyWithLogitsKernel(const size_t size, const T *logits, const S *labels, T *outputs) { @@ -41,10 +42,12 @@ void SigmoidCrossEntropyWithLogits(const size_t size, const T *logits, const S * SigmoidCrossEntropyWithLogitsKernel<<>>(size, logits, labels, outputs); } -template void SigmoidCrossEntropyWithLogits(const size_t size, const half *logits, const half *labels, - half *outputs, cudaStream_t cuda_stream); -template void SigmoidCrossEntropyWithLogits(const size_t size, const float *logits, const float *labels, - float *outputs, cudaStream_t cuda_stream); -template void SigmoidCrossEntropyWithLogits(const size_t size, const double *logits, - const double *labels, double *outputs, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogits(const size_t size, const half *logits, + const half *labels, half *outputs, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogits(const size_t size, const float *logits, + const float *labels, float *outputs, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogits(const size_t size, const double *logits, + const double *labels, double *outputs, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh index dbae8bc4b59..f98055cd75e 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh @@ -14,12 +14,11 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const T *logits, const S *labels, const T *dout_addr, - T *outputs, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void SigmoidCrossEntropyWithLogits(const size_t size, const T *logits, const S *labels, T *outputs, + cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cu new file mode 100644 index 00000000000..9844c5753cc --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cu @@ -0,0 +1,133 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh" +#include "include/cuda_fp16.h" + +namespace { +constexpr size_t kMaxDim = 8; +} + +template +class VectorWrapper { + public: + explicit VectorWrapper(const std::vector &v) { std::copy(v.begin(), v.end(), data); } + ~VectorWrapper() {} + __device__ T& operator[](size_t index) { return data[index]; } + + private: + T data[N]; +}; + +template +__global__ void CopySlicesKernel(VectorWrapper begins, VectorWrapper stride, + VectorWrapper u, VectorWrapper u_offset, + VectorWrapper o_offset, const T *update_addr, T *output_addr) { + size_t update_num = u[0] * u_offset[0]; + + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < update_num; pos += blockDim.x * gridDim.x) { + size_t i = pos / (u_offset[0]) % u[0]; + size_t j = pos / (u_offset[1]) % u[1]; + size_t k = pos / (u_offset[2]) % u[2]; + size_t l = pos / (u_offset[3]) % u[3]; + size_t m = pos / (u_offset[4]) % u[4]; + size_t n = pos / (u_offset[5]) % u[5]; + size_t o = pos / (u[7]) % u[6]; + size_t p = pos % u[7]; + + size_t output_idx = (i * stride[0] + begins[0]) * o_offset[0] + (j * stride[1] + begins[1]) * o_offset[1] + + (k * stride[2] + begins[2]) * o_offset[2] + (l * stride[3] + begins[3]) * o_offset[3] + + (m * stride[4] + begins[4]) * o_offset[4] + (n * stride[5] + begins[5]) * o_offset[5] + + (o * stride[6] + begins[6]) * o_offset[6] + (p * stride[7] + begins[7]); + output_addr[output_idx] = update_addr[pos]; + } +} + +std::vector 
CalculateOffset(const std::vector &shape) { + std::vector offset(kMaxDim); + offset[7] = 1; + offset[6] = offset[7] * shape[7]; + offset[5] = offset[6] * shape[6]; + offset[4] = offset[5] * shape[5]; + offset[3] = offset[4] * shape[4]; + offset[2] = offset[3] * shape[3]; + offset[1] = offset[2] * shape[2]; + offset[0] = offset[1] * shape[1]; + return offset; +} + +template +void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, const T *update, T *output, + cudaStream_t cuda_stream) { + size_t size = std::accumulate(update_shape.begin(), update_shape.end(), 1, std::multiplies()); + + VectorWrapper o_offset(CalculateOffset(output_shape)); + VectorWrapper u_offset(CalculateOffset(update_shape)); + + VectorWrapper begins(begin); + VectorWrapper strides(stride); + VectorWrapper update_shapes(update_shape); + + CopySlicesKernel<<>>(begins, strides, update_shapes, u_offset, + o_offset, update, output); +} + +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const bool *update, bool *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const double *update, double *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const float *update, float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const half *update, half *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector 
&begin, + const std::vector &stride, const std::vector &output_shape, + const int64_t *update, int64_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const int *update, int *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const short *update, short *output, cudaStream_t cuda_stream); // NOLINT +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const int8_t *update, int8_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const uint64_t *update, uint64_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const uint32_t *update, uint32_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const uint16_t *update, uint16_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const unsigned char *update, unsigned char *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const char *update, char *output, cudaStream_t 
cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh similarity index 51% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh index f3c2b1725eb..c72538d9030 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh @@ -14,15 +14,14 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_COPY_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_COPY_IMPL_CUH_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_COPY_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_COPY_IMPL_CUH_ #include #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, const T *update, - T *output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_COPY_IMPL_CUH_ +CUDA_LIB_EXPORT void CopySlices(const std::vector &update_shape, const std::vector &begin, + const std::vector &stride, const std::vector &output_shape, + const T *update, T *output, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_COPY_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cu new file mode 100644 index 00000000000..54a46ed30c1 --- /dev/null +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cu @@ -0,0 +1,670 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh" +#include "include/cuda_fp16.h" + +template +__global__ void Slice1D(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1; pos += blockDim.x * gridDim.x) { + output[pos] = input[pos + s1]; + } +} + +template +__global__ void Slice2D(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, + const size_t d2, const T *input, T *output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2; pos += blockDim.x * gridDim.x) { + size_t i = pos / l2 % l1; + size_t j = pos % l2; + + size_t offset = (i + s1) * d2 + (j + s2); + output[pos] = input[offset]; + } +} + +template +__global__ void Slice3D(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, + const size_t l3, const size_t d1, const size_t d2, const size_t d3, const T *input, T *output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3; pos += blockDim.x * gridDim.x) { + size_t i = pos / (l2 * l3) % l1; + size_t j = pos / l3 % l2; + size_t k = pos % l3; + + size_t offset = (i + s1) * (d2 * d3) 
+ (j + s2) * d3 + (k + s3); + output[pos] = input[offset]; + } +} + +template +__global__ void Slice4D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, + const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, + const size_t d3, const size_t d4, const T *input, T *output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4; pos += blockDim.x * gridDim.x) { + size_t i = pos / (l2 * l3 * l4) % l1; + size_t j = pos / (l3 * l4) % l2; + size_t k = pos / l4 % l3; + size_t o = pos % l4; + + size_t offset = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4); + output[pos] = input[offset]; + } +} + +template +__global__ void Slice5D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, + const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, + const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, + const T *input, T *output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5; + pos += blockDim.x * gridDim.x) { + size_t i = pos / (l2 * l3 * l4 * l5) % l1; + size_t j = pos / (l3 * l4 * l5) % l2; + size_t k = pos / (l4 * l5) % l3; + size_t o = pos / l5 % l4; + size_t q = pos % l5; + + size_t offset = + (i + s1) * (d2 * d3 * d4 * d5) + (j + s2) * (d3 * d4 * d5) + (k + s3) * (d4 * d5) + (o + s4) * d5 + (q + s5); + output[pos] = input[offset]; + } +} + +template +__global__ void Slice6D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, + const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, + const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, + const size_t d4, const size_t d5, const size_t d6, const T *input, T *output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5 * l6; + pos += 
blockDim.x * gridDim.x) {
    size_t i = pos / (l2 * l3 * l4 * l5 * l6) % l1;
    size_t j = pos / (l3 * l4 * l5 * l6) % l2;
    size_t k = pos / (l4 * l5 * l6) % l3;
    size_t o = pos / (l5 * l6) % l4;
    size_t q = pos / l6 % l5;
    size_t r = pos % l6;

    size_t offset = (i + s1) * (d2 * d3 * d4 * d5 * d6) + (j + s2) * (d3 * d4 * d5 * d6) +
                    (k + s3) * (d4 * d5 * d6) + (o + s4) * (d5 * d6) + (q + s5) * d6 + (r + s6);
    output[pos] = input[offset];
  }
}

// Copies the 7-D slice that starts at (s1..s7) with lengths (l1..l7) out of an input tensor
// whose dimensions are (d1..d7).  One output element per iteration; the loop strides by the
// total thread count so any launch size covers the whole output.
template <typename T>
__global__ void Slice7D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5,
                        const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3,
                        const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1,
                        const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                        const size_t d7, const T *input, T *output) {
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5 * l6 * l7;
       pos += blockDim.x * gridDim.x) {
    // Decompose the linear output index into 7-D slice coordinates.
    size_t i = pos / (l2 * l3 * l4 * l5 * l6 * l7) % l1;
    size_t j = pos / (l3 * l4 * l5 * l6 * l7) % l2;
    size_t k = pos / (l4 * l5 * l6 * l7) % l3;
    size_t o = pos / (l5 * l6 * l7) % l4;
    size_t q = pos / (l6 * l7) % l5;
    size_t r = pos / l7 % l6;
    size_t s = pos % l7;

    // Re-linearize against the full input dimensions, shifted by the slice begin offsets.
    size_t offset = (i + s1) * (d2 * d3 * d4 * d5 * d6 * d7) + (j + s2) * (d3 * d4 * d5 * d6 * d7) +
                    (k + s3) * (d4 * d5 * d6 * d7) + (o + s4) * (d5 * d6 * d7) + (q + s5) * (d6 * d7) +
                    (r + s6) * d7 + (s + s7);
    output[pos] = input[offset];
  }
}

// Gradient of the 4-D slice: scatters dy (shaped l1..l4) back into dx (shaped d1..d4) at
// begin offsets s1..s4.  Positions of dx outside the slice are untouched, so dx is expected
// to be pre-filled (see FillDeviceArray) before this kernel runs.
template <typename T>
__global__ void Slice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                            const T *dy, T *dx) {
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (l1 * l2 * l3 * l4); pos += blockDim.x * gridDim.x) {
    size_t i = pos / (l2 * l3 * l4) % l1;
    size_t j = pos / (l3 * l4) % l2;
    size_t k = pos / l4 % l3;
    size_t o = pos % l4;
    size_t input_idx = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4);
    dx[input_idx] = dy[pos];
  }
}

// Fills len elements of addr with value; the cast to T happens once, outside the loop.
template <typename T>
__global__ void FillArray(T *addr, const size_t len, const float value) {
  T value_ = static_cast<T>(value);
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < len; pos += blockDim.x * gridDim.x) {
    addr[pos] = value_;
  }
  return;
}

// NOTE(review): launch configurations below reconstructed as GET_BLOCKS(total)/GET_THREADS —
// the original <<<...>>> text was lost in transit; confirm against the pristine slice_impl.cu.
template <typename T>
void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream) {
  FillArray<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(addr, input_size, value);
  return;
}

// Host-side launchers: one per rank.  Each forwards begin offsets (s*), slice lengths (l*)
// and full input dimensions (d*) to the corresponding SliceND kernel on the given stream.
template <typename T>
void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output, cudaStream_t stream) {
  Slice1D<<<GET_BLOCKS(l1), GET_THREADS, 0, stream>>>(s1, l1, d1, input, output);
}
template <typename T>
void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, const size_t d2,
                   const T *input, T *output, cudaStream_t stream) {
  Slice2D<<<GET_BLOCKS(l1 * l2), GET_THREADS, 0, stream>>>(s1, s2, l1, l2, d1, d2, input, output);
}
template <typename T>
void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, const size_t l3,
                   const size_t d1, const size_t d2, const size_t d3, const T *input, T *output, cudaStream_t stream) {
  Slice3D<<<GET_BLOCKS(l1 * l2 * l3), GET_THREADS, 0, stream>>>(s1, s2, s3, l1, l2, l3, d1, d2, d3, input, output);
}
template <typename T>
void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, const size_t l2,
                   const size_t l3, const size_t l4, const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                   const T *input, T *output, cudaStream_t stream) {
  Slice4D<<<GET_BLOCKS(l1 * l2 * l3 * l4), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, l1, l2, l3, l4, d1, d2, d3, d4,
                                                                     input, output);
}
template <typename T>
void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t l1,
                   const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t d1, const size_t d2,
                   const size_t d3, const size_t d4, const size_t d5, const T *input, T *output, cudaStream_t stream) {
  Slice5D<<<GET_BLOCKS(l1 * l2 * l3 * l4 * l5), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, s5, l1, l2, l3, l4, l5, d1,
                                                                          d2, d3, d4, d5, input, output);
}
template <typename T>
void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6,
                   const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t l6,
                   const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                   const T *input, T *output, cudaStream_t stream) {
  Slice6D<<<GET_BLOCKS(l1 * l2 * l3 * l4 * l5 * l6), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, s5, s6, l1, l2, l3, l4,
                                                                               l5, l6, d1, d2, d3, d4, d5, d6, input,
                                                                               output);
}
template <typename T>
void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6,
                   const size_t s7, const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5,
                   const size_t l6, const size_t l7, const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                   const size_t d5, const size_t d6, const size_t d7, const T *input, T *output, cudaStream_t stream) {
  Slice7D<<<GET_BLOCKS(l1 * l2 * l3 * l4 * l5 * l6 * l7), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, s5, s6, s7, l1, l2,
                                                                                    l3, l4, l5, l6, l7, d1, d2, d3, d4,
                                                                                    d5, d6, d7, input, output);
}
template <typename T>
void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1,
                    const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2,
                    const size_t d3, const size_t d4, const T *dy, T *dx, cudaStream_t stream) {
  Slice4DGrad<<<GET_BLOCKS(l1 * l2 * l3 * l4), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, l1, l2, l3, l4, d1, d2, d3,
                                                                         d4, dy, dx);
}

// Strided slice over (up to) 7 dimensions: b* = begin, s* = stride, i* = input dims,
// o* = output dims.  Each output position maps to input index begin + pos * stride per axis.
template <typename T>
__global__ void StridedSliceKernel(const size_t b0, const size_t b1, const size_t b2, const size_t b3, const size_t b4,
                                   const size_t b5, const size_t b6, const size_t s0, const size_t s1, const size_t s2,
                                   const size_t s3, const size_t s4, const size_t s5, const size_t s6, const size_t i0,
                                   const size_t i1, const size_t i2, const size_t i3, const size_t i4, const size_t i5,
                                   const size_t i6, const size_t o0, const size_t o1, const size_t o2, const size_t o3,
                                   const size_t o4, const size_t o5, const size_t o6, const T *input_addr,
                                   T *output_addr) {
  size_t output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6;
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
    size_t i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0;
    size_t j = pos / (o2 * o3 * o4 * o5 * o6) % o1;
    size_t k = pos / (o3 * o4 * o5 * o6) % o2;
    size_t l = pos / (o4 * o5 * o6) % o3;
    size_t m = pos / (o5 * o6) % o4;
    size_t n = pos / (o6) % o5;
    size_t o = pos % o6;

    size_t input_idx = (i * s0 + b0) * i1 * i2 * i3 * i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 +
                       (k * s2 + b2) * i3 * i4 * i5 * i6 + (l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 +
                       (n * s5 + b5) * i6 + (o * s6 + b6);
    output_addr[pos] = input_addr[input_idx];
  }
}

// Host wrapper: shapes/begin/strides are 7-element vectors (padded upstream to rank 7).
template <typename T>
void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                  const std::vector<size_t> &strides, const std::vector<size_t> &output_shape, const T *input,
                  T *output, cudaStream_t cuda_stream) {
  size_t size = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3] * output_shape[4] *
                output_shape[5] * output_shape[6];
  StridedSliceKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
    begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6], strides[0], strides[1], strides[2],
    strides[3], strides[4], strides[5], strides[6], input_shape[0], input_shape[1], input_shape[2], input_shape[3],
    input_shape[4], input_shape[5], input_shape[6], output_shape[0], output_shape[1], output_shape[2], output_shape[3],
    output_shape[4], output_shape[5], output_shape[6], input, output);
}

// Gradient of the strided slice: scatters dy back into dx using the same index mapping.
// (Fixed: the source carried a duplicated, effect-free copy of the index expression.)
template <typename T>
__global__ void StridedSliceGradKernel(const size_t b0, const size_t b1, const size_t b2, const size_t b3,
                                       const size_t b4, const size_t b5, const size_t b6, const size_t s0,
                                       const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                       const size_t s5, const size_t s6, const size_t i0, const size_t i1,
                                       const size_t i2, const size_t i3, const size_t i4, const size_t i5,
                                       const size_t i6, const size_t o0, const size_t o1, const size_t o2,
                                       const size_t o3, const size_t o4, const size_t o5, const size_t o6, const T *dy,
                                       T *dx) {
  size_t output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6;
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
    size_t i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0;
    size_t j = pos / (o2 * o3 * o4 * o5 * o6) % o1;
    size_t k = pos / (o3 * o4 * o5 * o6) % o2;
    size_t l = pos / (o4 * o5 * o6) % o3;
    size_t m = pos / (o5 * o6) % o4;
    size_t n = pos / (o6) % o5;
    size_t o = pos % o6;

    size_t input_idx = (i * s0 + b0) * i1 * i2 * i3 * i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 +
                       (k * s2 + b2) * i3 * i4 * i5 * i6 + (l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 +
                       (n * s5 + b5) * i6 + (o * s6 + b6);
    dx[input_idx] = dy[pos];
  }
  return;
}

template <typename T>
void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<size_t> &begin,
                      const std::vector<size_t> &strides, const std::vector<size_t> &dx_shape, const T *dy, T *dx,
                      cudaStream_t cuda_stream) {
  size_t size = dy_shape[0] * dy_shape[1] * dy_shape[2] * dy_shape[3] * dy_shape[4] * dy_shape[5] * dy_shape[6];
  StridedSliceGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
    begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6], strides[0], strides[1], strides[2],
    strides[3], strides[4], strides[5], strides[6], dx_shape[0], dx_shape[1], dx_shape[2], dx_shape[3], dx_shape[4],
    dx_shape[5], dx_shape[6], dy_shape[0], dy_shape[1], dy_shape[2], dy_shape[3], dy_shape[4], dy_shape[5],
    dy_shape[6], dy, dx);
}

// Explicit instantiations exported from the cuda_ops shared library.
template CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const double *input,
                                            double *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const float *input,
                                            float *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const half *input,
                                            half *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const int *input,
                                            int *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const short *input,  // NOLINT
                                            short *output, cudaStream_t stream);  // NOLINT
template CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1,
                                            const unsigned char *input, unsigned char *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const int64_t *input,
                                            int64_t *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const bool *input,
                                            bool *output, cudaStream_t stream);

template CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2,
                                            const size_t d1, const size_t d2, const double *input, double *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2,
                                            const size_t d1, const size_t d2, const float *input, float *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2,
                                            const size_t d1, const size_t d2, const half *input, half *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2,
                                            const size_t d1, const size_t d2, const int *input, int *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2,
                                            const size_t d1, const size_t d2, const short *input, short *output,  // NOLINT
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2,
                                            const size_t d1, const size_t d2, const unsigned char *input,
                                            unsigned char *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2,
                                            const size_t d1, const size_t d2, const int64_t *input, int64_t *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2,
                                            const size_t d1, const size_t d2, const bool *input, bool *output,
                                            cudaStream_t stream);

template CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t d1, const size_t d2,
                                            const size_t d3, const double *input, double *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t d1, const size_t d2,
                                            const size_t d3, const float *input, float *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t d1, const size_t d2,
                                            const size_t d3, const half *input, half *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t d1, const size_t d2,
                                            const size_t d3, const int *input, int *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t d1, const size_t d2,
                                            const size_t d3, const short *input, short *output, cudaStream_t stream);  // NOLINT
template CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t d1, const size_t d2,
                                            const size_t d3, const unsigned char *input, unsigned char *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t d1, const size_t d2,
                                            const size_t d3, const int64_t *input, int64_t *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t d1, const size_t d2,
                                            const size_t d3, const bool *input, bool *output, cudaStream_t stream);

template CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const double *input, double *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const float *input, float *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const half *input, half *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const int *input, int *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const short *input, short *output, cudaStream_t stream);  // NOLINT
template CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const unsigned char *input, unsigned char *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const int64_t *input, int64_t *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const bool *input, bool *output, cudaStream_t stream);

template CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t l1, const size_t l2, const size_t l3,
                                            const size_t l4, const size_t l5, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const double *input,
                                            double *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t l1, const size_t l2, const size_t l3,
                                            const size_t l4, const size_t l5, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const float *input,
                                            float *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t l1, const size_t l2, const size_t l3,
                                            const size_t l4, const size_t l5, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const half *input,
                                            half *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t l1, const size_t l2, const size_t l3,
                                            const size_t l4, const size_t l5, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const int64_t *input,
                                            int64_t *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t l1, const size_t l2, const size_t l3,
                                            const size_t l4, const size_t l5, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const int *input,
                                            int *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t l1, const size_t l2, const size_t l3,
                                            const size_t l4, const size_t l5, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const short *input,  // NOLINT
                                            short *output, cudaStream_t stream);  // NOLINT
template CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t l1, const size_t l2, const size_t l3,
                                            const size_t l4, const size_t l5, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5,
                                            const unsigned char *input, unsigned char *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t l1, const size_t l2, const size_t l3,
                                            const size_t l4, const size_t l5, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const bool *input,
                                            bool *output, cudaStream_t stream);

template CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t l1, const size_t l2,
                                            const size_t l3, const size_t l4, const size_t l5, const size_t l6,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const size_t d5, const size_t d6, const double *input, double *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t l1, const size_t l2,
                                            const size_t l3, const size_t l4, const size_t l5, const size_t l6,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const size_t d5, const size_t d6, const float *input, float *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t l1, const size_t l2,
                                            const size_t l3, const size_t l4, const size_t l5, const size_t l6,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const size_t d5, const size_t d6, const half *input, half *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t l1, const size_t l2,
                                            const size_t l3, const size_t l4, const size_t l5, const size_t l6,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const size_t d5, const size_t d6, const int64_t *input, int64_t *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t l1, const size_t l2,
                                            const size_t l3, const size_t l4, const size_t l5, const size_t l6,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const size_t d5, const size_t d6, const int *input, int *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t l1, const size_t l2,
                                            const size_t l3, const size_t l4, const size_t l5, const size_t l6,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const size_t d5, const size_t d6, const short *input, short *output,  // NOLINT
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t l1, const size_t l2,
                                            const size_t l3, const size_t l4, const size_t l5, const size_t l6,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const size_t d5, const size_t d6, const unsigned char *input,
                                            unsigned char *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t l1, const size_t l2,
                                            const size_t l3, const size_t l4, const size_t l5, const size_t l6,
                                            const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                            const size_t d5, const size_t d6, const bool *input, bool *output,
                                            cudaStream_t stream);

template CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t s7, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t l4, const size_t l5,
                                            const size_t l6, const size_t l7, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                                            const size_t d7, const double *input, double *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t s7, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t l4, const size_t l5,
                                            const size_t l6, const size_t l7, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                                            const size_t d7, const float *input, float *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t s7, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t l4, const size_t l5,
                                            const size_t l6, const size_t l7, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                                            const size_t d7, const half *input, half *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t s7, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t l4, const size_t l5,
                                            const size_t l6, const size_t l7, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                                            const size_t d7, const int64_t *input, int64_t *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t s7, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t l4, const size_t l5,
                                            const size_t l6, const size_t l7, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                                            const size_t d7, const int *input, int *output, cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t s7, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t l4, const size_t l5,
                                            const size_t l6, const size_t l7, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                                            const size_t d7, const short *input, short *output, cudaStream_t stream);  // NOLINT
template CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t s7, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t l4, const size_t l5,
                                            const size_t l6, const size_t l7, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                                            const size_t d7, const unsigned char *input, unsigned char *output,
                                            cudaStream_t stream);
template CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                            const size_t s5, const size_t s6, const size_t s7, const size_t l1,
                                            const size_t l2, const size_t l3, const size_t l4, const size_t l5,
                                            const size_t l6, const size_t l7, const size_t d1, const size_t d2,
                                            const size_t d3, const size_t d4, const size_t d5, const size_t d6,
                                            const size_t d7, const bool *input, bool *output, cudaStream_t stream);

template CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                             const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                             const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                             const double *dy, double *dx, cudaStream_t stream);
template CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                             const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                             const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                             const float *dy, float *dx, cudaStream_t stream);
template CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                             const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                             const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                             const half *dy, half *dx, cudaStream_t stream);
template CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                             const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                             const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                             const int *dy, int *dx, cudaStream_t stream);
template CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,  // NOLINT
                                             const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                             const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                             const short *dy, short *dx, cudaStream_t stream);  // NOLINT
template CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                             const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                             const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                             const unsigned char *dy, unsigned char *dx, cudaStream_t stream);
template CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                             const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                             const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                             const int64_t *dy, int64_t *dx, cudaStream_t stream);
template CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4,
                                             const size_t l1, const size_t l2, const size_t l3, const size_t l4,
                                             const size_t d1, const size_t d2, const size_t d3, const size_t d4,
                                             const bool *dy, bool *dx, cudaStream_t stream);

template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, bool *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, int64_t *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, int *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, short *addr, const float value,  // NOLINT
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, int8_t *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, uint64_t *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, uint32_t *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, uint16_t *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, unsigned char *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, half *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, float *addr, const float value,
                                              cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, double *addr, const float value,
                                              cudaStream_t cuda_stream);

template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const bool *input, bool *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const double *input, double *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const float *input, float *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const half *input, half *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const int64_t *input, int64_t *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const int *input, int *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const short *input, short *output, cudaStream_t cuda_stream);  // NOLINT
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const int8_t *input, int8_t *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const uint64_t *input, uint64_t *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const uint32_t *input, uint32_t *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const uint16_t *input, uint16_t *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<size_t> &begin,
                                           const std::vector<size_t> &strides, const std::vector<size_t> &output_shape,
                                           const unsigned char *input, unsigned char *output, cudaStream_t cuda_stream);

template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<size_t> &begin,
                                               const std::vector<size_t> &strides, const std::vector<size_t> &dx_shape,
                                               const bool *dy, bool *dx, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<size_t> &begin,
                                               const std::vector<size_t> &strides, const std::vector<size_t> &dx_shape,
                                               const double *dy, double *dx, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<size_t> &begin,
                                               const std::vector<size_t> &strides, const std::vector<size_t> &dx_shape,
                                               const float *dy, float *dx, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<size_t> &begin,
                                               const std::vector<size_t> &strides, const std::vector<size_t> &dx_shape,
                                               const half *dy, half *dx, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<size_t> &begin,
                                               const std::vector<size_t> &strides, const std::vector<size_t> &dx_shape,
                                               const int64_t *dy, int64_t *dx, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<size_t> &begin,
                                               const std::vector<size_t> &strides, const std::vector<size_t> &dx_shape,
                                               const int *dy, int *dx, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector<size_t> &dy_shape, const std::vector<size_t> &begin,
                                               const std::vector<size_t> &strides, const std::vector<size_t> &dx_shape,
                                               const short *dy, short *dx, cudaStream_t cuda_stream);  // NOLINT
template CUDA_LIB_EXPORT void StridedSliceGrad(const
std::vector &dy_shape, const std::vector &begin, + const std::vector &strides, const std::vector &dx_shape, + const int8_t *dy, int8_t *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, + const std::vector &strides, const std::vector &dx_shape, + const uint64_t *dy, uint64_t *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, + const std::vector &strides, const std::vector &dx_shape, + const uint32_t *dy, uint32_t *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, + const std::vector &strides, const std::vector &dx_shape, + const uint16_t *dy, uint16_t *dx, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, + const std::vector &strides, const std::vector &dx_shape, + const unsigned char *dy, unsigned char *dx, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh new file mode 100644 index 00000000000..86fd9238d27 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh @@ -0,0 +1,83 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_IMPL_CUH_ +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void SliceKernel(const T *input, T *output, const size_t output_size, cudaStream_t cuda_stream, + S...pack); + +template +CUDA_LIB_EXPORT void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, + const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, + const size_t d3, const size_t d4, const T *dy, T *dx, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output, + cudaStream_t stream); + +template +CUDA_LIB_EXPORT void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, + const size_t d2, const T *input, T *output, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, + const size_t l3, const size_t d1, const size_t d2, const size_t d3, const T *input, + T *output, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, + const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, + const size_t d3, const size_t d4, const T *input, T *output, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, + const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, + const size_t d1, const size_t d2, const size_t d3, const 
size_t d4, const size_t d5, + const T *input, T *output, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, + const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, + const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, + const size_t d4, const size_t d5, const size_t d6, const T *input, T *output, + cudaStream_t stream); + +template +CUDA_LIB_EXPORT void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, + const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3, + const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, + const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, + const size_t d7, const T *input, T *output, cudaStream_t stream); + +template +CUDA_LIB_EXPORT void StridedSlice(const std::vector &input_shape, const std::vector &begin, + const std::vector &strides, const std::vector &output_shape, + const T *input, T *output, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, + const std::vector &strides, const std::vector &dx_shape, + const T *dy, T *dx, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SLICE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cu similarity index 67% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cu index 
028511192c4..f87f1790c11 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cu @@ -15,7 +15,7 @@ */ #include "smooth_l1_loss_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "include/cuda_fp16.h" template __global__ void SmoothL1LossKernel(const int input_size, const float beta, const T *prediction, const T *target, @@ -87,17 +87,20 @@ void SmoothL1LossGrad(const int &input_size, const float &beta, const T *predict SmoothL1LossGradKernel<<>>(input_size, beta, prediction, target, dloss, dx); } -template void SmoothL1Loss(const int &input_size, const float &beta, const double *prediction, - const double *target, double *loss, cudaStream_t stream); -template void SmoothL1LossGrad(const int &input_size, const float &beta, const double *prediction, - const double *target, const double *dloss, double *dx, cudaStream_t stream); +template CUDA_LIB_EXPORT void SmoothL1Loss(const int &input_size, const float &beta, const double *prediction, + const double *target, double *loss, cudaStream_t stream); +template CUDA_LIB_EXPORT void SmoothL1LossGrad(const int &input_size, const float &beta, + const double *prediction, const double *target, + const double *dloss, double *dx, cudaStream_t stream); -template void SmoothL1Loss(const int &input_size, const float &beta, const float *prediction, - const float *target, float *loss, cudaStream_t stream); -template void SmoothL1LossGrad(const int &input_size, const float &beta, const float *prediction, - const float *target, const float *dloss, float *dx, cudaStream_t stream); +template CUDA_LIB_EXPORT void SmoothL1Loss(const int &input_size, const float &beta, const float *prediction, + const float *target, float *loss, cudaStream_t stream); +template CUDA_LIB_EXPORT void SmoothL1LossGrad(const int &input_size, const float &beta, const float *prediction, + const float *target, const float 
*dloss, float *dx, + cudaStream_t stream); -template void SmoothL1Loss(const int &input_size, const float &beta, const half *prediction, - const half *target, half *loss, cudaStream_t stream); -template void SmoothL1LossGrad(const int &input_size, const float &beta, const half *prediction, - const half *target, const half *dloss, half *dx, cudaStream_t stream); +template CUDA_LIB_EXPORT void SmoothL1Loss(const int &input_size, const float &beta, const half *prediction, + const half *target, half *loss, cudaStream_t stream); +template CUDA_LIB_EXPORT void SmoothL1LossGrad(const int &input_size, const float &beta, const half *prediction, + const half *target, const half *dloss, half *dx, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh new file mode 100644 index 00000000000..d943a5a65c6 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SMOOTH_L1_LOSS_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SMOOTH_L1_LOSS_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void SmoothL1Loss(const int &input_size, const float &beta, const T *prediction, const T *target, + T *loss, cudaStream_t stream); +template +CUDA_LIB_EXPORT void SmoothL1LossGrad(const int &input_size, const float &beta, const T *prediction, const T *target, + const T *dloss, T *dx, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SMOOTH_L1_LOSS_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cu similarity index 78% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cu index 17a120c95b6..ebc3dab9593 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/softplus_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void SoftplusKernel(const size_t size, const T *input_addr, T *output_addr) { @@ -71,9 +71,11 @@ void SoftplusGrad(const size_t size, const half *dy_addr, const half *x_addr, ha SoftplusGradKernel<<>>(size, dy_addr, x_addr, dx_addr); } -template void Softplus(const size_t size, const float *input_addr, float *output_addr, cudaStream_t cuda_stream); -template void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t cuda_stream); -template void SoftplusGrad(const size_t size, const float *dy_addr, const float *x_addr, float *dx_addr, - cudaStream_t cuda_stream); -template void SoftplusGrad(const size_t size, const half *dy_addr, const half *x_addr, half *dx_addr, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Softplus(const size_t size, const float *input_addr, float *output_addr, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Softplus(const size_t size, const half *input_addr, half *output_addr, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SoftplusGrad(const size_t size, const float *dy_addr, const float *x_addr, float *dx_addr, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SoftplusGrad(const size_t size, const half *dy_addr, const half *x_addr, half *dx_addr, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh similarity index 51% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh index 4b8fad79441..c5bcd49cd20 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh @@ -14,14 +14,14 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GELU_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GELU_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SOFTPLUS_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SOFTPLUS_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void Gelu(size_t input_size, T* input_addr, T* output_addr, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void Softplus(const size_t input_size, const T* input_addr, T* output_addr, cudaStream_t cuda_stream); template -void GeluGradKernel(size_t size, T* dy_addr, T* x_addr, T* dx_addr, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void SoftplusGrad(const size_t size, const T* dy_addr, const T* x_addr, T* dx_addr, + cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_GELU_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SOFTPLUS_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cu new file mode 100644 index 00000000000..279681a2f2b --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cu @@ -0,0 +1,140 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "spacetobatch_impl.cuh" +#include "include/cuda_fp16.h" + +template +__global__ void SpaceToBatch(const size_t size, const T *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, const size_t block_num, + T *output) { + size_t temp_stride = 0; + size_t temp_pos = 0; + size_t idx_in = 0; + size_t idx_ic = 0; + size_t idx_ih = 0; + size_t idx_iw = 0; + size_t idx_on = 0; + size_t output_pos = 0; + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; + pos += blockDim.x * gridDim.x) { + temp_stride = ic * ih * iw; + idx_in = pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= ic; + idx_ic = temp_pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= ih; + idx_ih = temp_pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= iw; + idx_iw = temp_pos / temp_stride; + + idx_on = (((idx_ih + pad_up) % block_num) * block_num + ((idx_iw + pad_lft) % block_num)) * in + idx_in; + output_pos = idx_on * oc; + output_pos = (output_pos + idx_ic) * oh; + output_pos = (output_pos + ((idx_ih + pad_up) - (idx_on / (in * block_num))) / block_num) * ow; + output_pos = (output_pos + ((idx_iw + pad_lft) - ((idx_on / in) % block_num)) / block_num); + output[output_pos] = input[pos]; + } + return; +} + +template +void CalSpaceToBatch(const size_t size, const T *input, const size_t in, + const size_t ih, 
const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, const size_t block_num, + T *output, cudaStream_t cuda_stream) { + cudaMemset(output, 0, on * oc * oh * ow * sizeof(T)); + SpaceToBatch<<>>( + size, input, in, ih, iw, ic, on, oh, ow, oc, pad_up, pad_dn, pad_lft, pad_rht, block_num, output); + return; +} + +template CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const float *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, const size_t block_num, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const half *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, const size_t block_num, + half *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const int *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, const size_t block_num, + int *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const int64_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, + const size_t block_num, int64_t *output, + cudaStream_t cuda_stream); +template 
CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const int16_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, + const size_t block_num, int16_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const int8_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, + const size_t block_num, int8_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const uint8_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, + const size_t block_num, uint8_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const uint16_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, + const size_t block_num, uint16_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const uint32_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, + const size_t block_num, uint32_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t 
size, const uint64_t *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, + const size_t block_num, uint64_t *output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh new file mode 100644 index 00000000000..c9f20719999 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETOBATCH_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETOBATCH_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalSpaceToBatch(const size_t size, const T *input, const size_t in, + const size_t ih, const size_t iw, const size_t ic, + const size_t on, const size_t oh, const size_t ow, + const size_t oc, const size_t pad_up, const size_t pad_dn, + const size_t pad_lft, const size_t pad_rht, const size_t block_num, + T *output, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETOBATCH_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cu new file mode 100644 index 00000000000..562e40e7af7 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cu @@ -0,0 +1,138 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "spacetodepth_impl.cuh" +#include "include/cuda_fp16.h" + +template +__global__ void SpaceToDepth(const size_t size, const T *input, const size_t in, + const size_t ic, const size_t ih, const size_t iw, + const size_t on, const size_t oc, const size_t oh, + const size_t ow, const size_t r, T *output) { + size_t temp_stride = 0; + size_t temp_pos = 0; + size_t output_pos = 0; + size_t input_pos_array[SPACETODEPTH_BUFFER_DIMENSION]; + + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; + pos += blockDim.x * gridDim.x) { + temp_stride = ic * ih * iw; + input_pos_array[0] = pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= ic; + input_pos_array[1] = temp_pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= ih; + input_pos_array[2] = temp_pos / temp_stride; + temp_pos = pos % temp_stride; + + temp_stride /= iw; + input_pos_array[3] = temp_pos / temp_stride; + + output_pos += input_pos_array[0]; + output_pos = (output_pos * oc) + + (input_pos_array[1] + + (r * (input_pos_array[2] % r) + input_pos_array[3] % r) * ic); + output_pos = (output_pos * oh) + (input_pos_array[2] / r); + output_pos = (output_pos * ow) + (input_pos_array[3] / r); + + output[output_pos] = input[pos]; + output_pos = 0; + } + return; +} + +template +void CalSpaceToDepth(const size_t size, const T *input, const size_t in, + const size_t ic, const size_t ih, const size_t iw, + const size_t on, const size_t oc, const size_t oh, + const size_t ow, const size_t r, T *output, + cudaStream_t cuda_stream) { + SpaceToDepth<<>>( + size, input, in, ic, ih, iw, on, oc, oh, ow, r, output); + return; +} + +template CUDA_LIB_EXPORT void CalSpaceToDepth(const size_t size, const float *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void 
CalSpaceToDepth(const size_t size, const half *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, half *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToDepth(const size_t size, const int *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, int *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToDepth(const size_t size, const int64_t *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, int64_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToDepth(const size_t size, const int16_t *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, int16_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToDepth(const size_t size, const int8_t *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, int8_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSpaceToDepth(const size_t size, const uint8_t *input, + const size_t in, const size_t ic, + const size_t ih, const size_t iw, + const size_t on, const size_t oc, + const size_t oh, const size_t ow, + const size_t r, uint8_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void +CalSpaceToDepth(const size_t size, const uint16_t *input, + const size_t in, const size_t ic, const size_t ih, + const size_t iw, const size_t on, const size_t oc, + const size_t oh, const size_t ow, const size_t r, + uint16_t *output, 
cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void +CalSpaceToDepth(const size_t size, const uint32_t *input, + const size_t in, const size_t ic, const size_t ih, + const size_t iw, const size_t on, const size_t oc, + const size_t oh, const size_t ow, const size_t r, + uint32_t *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void +CalSpaceToDepth(const size_t size, const uint64_t *input, + const size_t in, const size_t ic, const size_t ih, + const size_t iw, const size_t on, const size_t oc, + const size_t oh, const size_t ow, const size_t r, + uint64_t *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh new file mode 100644 index 00000000000..7dfcc853232 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETODEPTH_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETODEPTH_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#define SPACETODEPTH_BUFFER_DIMENSION 4 +template +CUDA_LIB_EXPORT void CalSpaceToDepth(const size_t size, const T *input, const size_t in, + const size_t ic, const size_t ih, const size_t iw, + const size_t on, const size_t oc, const size_t oh, + const size_t ow, const size_t r, T *output, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPACETODEPTH_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cu similarity index 68% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cu index 41a33427509..01711a947b7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cu @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh" +#include "include/cuda_fp16.h" template __device__ __forceinline__ bool CompareFunc(T x, T y) { @@ -90,14 +91,18 @@ void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size, variable_out, accumulation_out); } -template void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size, - const float *learning_rate, const float *l1_regularization, - const float *l2_regularization, const float *gradient, - const int *indices, float *variable, float *accumulation, - float *variable_out, float *accumulation_out, - cudaStream_t cuda_stream); -template void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size, - const half *learning_rate, const half *l1_regularization, - const half *l2_regularization, const half *gradient, - const int *indices, half *variable, half *accumulation, - half *variable_out, half *accumulation_out, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size, + const float *learning_rate, + const float *l1_regularization, + const float *l2_regularization, + const float *gradient, const int *indices, + float *variable, float *accumulation, + float *variable_out, float *accumulation_out, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size, + const half *learning_rate, + const half *l1_regularization, + const half *l2_regularization, const half *gradient, + const int *indices, half *variable, + half *accumulation, half *variable_out, + half *accumulation_out, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh new file mode 100644 index 00000000000..c866f966352 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size, const T *learning_rate, + const T *l1_regularization, const T *l2_regularization, + const T *gradient, const int *indices, T *variable, T *accumulation, + T *variable_out, T *accumulation_out, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cu similarity index 72% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cu rename to 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cu index b549c5bd4ee..a31c738a318 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cu @@ -16,7 +16,6 @@ #include #include "sparse_cross_entropy_cuda_impl.cuh" -#include "include/cuda_runtime.h" template __global__ void CalCrossEntropyKernel(const float *logits, T *labels, const int batch_size, const int class_num, @@ -67,11 +66,11 @@ void CalCrossEntropyGrad(const float *logits, T *labels, const int batch_size, c return; } -template void CalCrossEntropy(const float *logits, int *labels, const int batch_size, const int class_num, - float *loss, cudaStream_t cuda_stream); -template void CalCrossEntropy(const float *logits, uint64_t *labels, const int batch_size, - const int class_num, float *loss, cudaStream_t cuda_stream); -template void CalCrossEntropyGrad(const float *logits, int *labels, const int batch_size, const int class_num, - float *grad, cudaStream_t cuda_stream); -template void CalCrossEntropyGrad(const float *logits, uint64_t *labels, const int batch_size, - const int class_num, float *grad, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCrossEntropy(const float *logits, int *labels, const int batch_size, + const int class_num, float *loss, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCrossEntropy(const float *logits, uint64_t *labels, const int batch_size, + const int class_num, float *loss, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCrossEntropyGrad(const float *logits, int *labels, const int batch_size, + const int class_num, float *grad, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalCrossEntropyGrad(const float *logits, uint64_t *labels, const int batch_size, + const int class_num, float *grad, cudaStream_t cuda_stream); diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cuh new file mode 100755 index 00000000000..5c7d2ee4065 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_cross_entropy_cuda_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_CROSS_ENTROPY_CUDA_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_CROSS_ENTROPY_CUDA_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void CalCrossEntropy(const float *logits, T *labels, const int batch_size, const int class_num, + float *loss, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalCrossEntropyGrad(const float *logits, T *labels, const int batch_size, const int class_num, + float *grad, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_CROSS_ENTROPY_CUDA_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cu similarity index 54% rename from 
mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cu index 19172b48023..78b7b6b4b10 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cu @@ -15,7 +15,6 @@ */ #include "sparse_ftrl_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" #include "include/cuda_fp16.h" template @@ -89,24 +88,35 @@ void CalSparseApplyFtrl(const T *gradient, const S *indices, const int num_index n_stride, learning_rate, l1_regularization, l2_regularization, learning_rate_power, variable, accumulation, linear); } -template void CalSparseApplyFtrl(const float *gradient, const int *indices, const int num_index, - const size_t n_stride, const float learning_rate, - const float l1_regularization, const float l2_regularization, - const float learning_rate_power, const bool use_locking, float *variable, - float *accumulation, float *linear, cudaStream_t cuda_stream); -template void CalSparseApplyFtrl(const float *gradient, const int64_t *indices, const int num_index, - const size_t n_stride, const float learning_rate, - const float l1_regularization, const float l2_regularization, - const float learning_rate_power, const bool use_locking, float *variable, - float *accumulation, float *linear, cudaStream_t cuda_stream); -template void CalSparseApplyFtrl(const half *gradient, const int *indices, const int num_index, - const size_t n_stride, const float learning_rate, - const float l1_regularization, const float l2_regularization, - const float learning_rate_power, const bool use_locking, half *variable, - half *accumulation, half *linear, cudaStream_t cuda_stream); -template void CalSparseApplyFtrl(const half *gradient, const int64_t *indices, const int num_index, - const size_t n_stride, const float learning_rate, - const float l1_regularization, const float 
l2_regularization, - const float learning_rate_power, const bool use_locking, half *variable, - half *accumulation, half *linear, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSparseApplyFtrl(const float *gradient, const int *indices, + const int num_index, const size_t n_stride, + const float learning_rate, const float l1_regularization, + const float l2_regularization, + const float learning_rate_power, const bool use_locking, + float *variable, float *accumulation, float *linear, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSparseApplyFtrl(const float *gradient, const int64_t *indices, + const int num_index, const size_t n_stride, + const float learning_rate, + const float l1_regularization, + const float l2_regularization, + const float learning_rate_power, + const bool use_locking, float *variable, + float *accumulation, float *linear, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSparseApplyFtrl(const half *gradient, const int *indices, + const int num_index, const size_t n_stride, + const float learning_rate, const float l1_regularization, + const float l2_regularization, + const float learning_rate_power, const bool use_locking, + half *variable, half *accumulation, half *linear, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSparseApplyFtrl(const half *gradient, const int64_t *indices, + const int num_index, const size_t n_stride, + const float learning_rate, + const float l1_regularization, + const float l2_regularization, + const float learning_rate_power, const bool use_locking, + half *variable, half *accumulation, half *linear, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cuh new file mode 100644 index 00000000000..6020ad2eb27 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cuh @@ 
-0,0 +1,27 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_FTRL_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_FTRL_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalSparseApplyFtrl(const T *gradient, const S *indices, const int num_index, const size_t n_stride, + const float learning_rate, const float l1_regularization, + const float l2_regularization, const float learning_rate_power, + const bool use_locking, T *variable, T *accumulation, T *linear, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPARSE_FTRL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cu new file mode 100755 index 00000000000..ef8916db09f --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cu @@ -0,0 +1,63 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh" +#include "include/cuda_fp16.h" +template +__global__ void Split(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const T* input, T** outputs) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + int num = pos % all_size_before_axis / all_size_axis; + int block = num / axis_step; + int block_pos = pos / all_size_before_axis * axis_step * all_size_axis + + num % axis_step * all_size_axis + pos % all_size_axis; + outputs[block][block_pos] = input[pos]; + } + return; +} + +template +void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) { + Split<<>>(size, axis_step, all_size_before_axis, + all_size_axis, input, outputs); + return; +} + +template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const half* input, half** outputs, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const float* input, float** outputs, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const double* input, double** outputs, + 
cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const int* input, int** outputs, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const uint32_t* input, uint32_t** outputs, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const int64_t* input, int64_t** outputs, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const bool* input, bool** outputs, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh similarity index 53% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh index 7ca1593be6f..e3d1f9386e4 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh @@ -14,11 +14,10 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPLIT_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPLIT_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, - const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ +CUDA_LIB_EXPORT void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SPLIT_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cu similarity index 82% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cu index 7feb03a78bc..2d511cb56fe 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" template __global__ void SquareSumAllKernel(const size_t size, const T* input_addr_0, const T* input_addr_1, @@ -92,9 +92,9 @@ void SquareSumAll(const size_t input_size_, const T* input_addr_0, const T* inpu AssignKernel<<>>(1, output_addr_0, output_addr_1, ws_addr_0, ws_addr_1); } -template void SquareSumAll(const size_t input_size_, const half* input_addr_0, const half* input_addr_1, - half* output_addr_0, half* output_addr_1, float* ws_addr_0, float* ws_addr_1, - cudaStream_t cuda_stream); -template void SquareSumAll(const size_t input_size_, const float* input_addr_0, const float* input_addr_1, - float* output_addr_0, float* output_addr_1, float* ws_addr_0, float* ws_addr_1, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SquareSumAll(const size_t input_size_, const half* input_addr_0, const half* input_addr_1, + half* output_addr_0, half* output_addr_1, float* ws_addr_0, float* ws_addr_1, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SquareSumAll(const size_t input_size_, const float* input_addr_0, + const float* input_addr_1, float* output_addr_0, float* output_addr_1, + float* ws_addr_0, float* ws_addr_1, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh similarity index 50% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh index 81e10d1d49e..207840f5121 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_impl.cuh +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh @@ -14,14 +14,12 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SQUARE_SUM_ALL_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SQUARE_SUM_ALL_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void SquareSumAll(const size_t input_size_, const T* input_addr_0, const T* input_addr_1, + T* output_addr_0, T* output_addr_1, float* ws_addr_0, float* ws_addr_1, + cudaStream_t cuda_stream); -template -void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_IMPL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SQUARE_SUM_ALL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cu new file mode 100644 index 00000000000..ea2f85f6ed4 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cu @@ -0,0 +1,219 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh" + +const int kWarpSize = 32; +const int kNumWarps = 32; + +__inline__ __device__ float HalfFloatInputConvert(const half val) { return __half2float(val); } +__inline__ __device__ float HalfFloatInputConvert(const float val) { return val; } +__inline__ __device__ void HalfFloatOutputAssign(const float val, float *arr, int idx) { arr[idx] = val; } +__inline__ __device__ void HalfFloatOutputAssign(const float val, half *arr, int idx) { arr[idx] = __float2half(val); } + +template +__global__ void SyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, + G *saved_mean, G *saved_invstd, float *dy_sum_local, float *dot_p_local) { + // block level memory + __shared__ float shared_dy[kNumWarps]; + __shared__ float shared_dot_p[kNumWarps]; + int warpId = threadIdx.x / kWarpSize; // threads are arranged in warps of 32 executed together + int laneId = threadIdx.x % kWarpSize; + + int plane = blockIdx.x; // this thread will only function on a single plane + int plane_size = N * H * W; + float mean = static_cast(saved_mean[plane]); + + if (threadIdx.x < kNumWarps) { + shared_dy[threadIdx.x] = static_cast(0); + shared_dot_p[threadIdx.x] = static_cast(0); + } + + __syncthreads(); // ensure all 0 init complete across all values + + float dy_sum = static_cast(0); + float dot_p = static_cast(0); + + // individual thread level reduction + for (int x = threadIdx.x; x < 
plane_size; x += blockDim.x) { + int index = (x / (H * W) * C * H * W) + (plane * H * W) + (x % (H * W)); + float input_value = HalfFloatInputConvert(x_input[index]); + float dy_value = HalfFloatInputConvert(dy[index]); + dy_sum += dy_value; + dot_p += (input_value - mean) * dy_value; + } + __syncthreads(); + // warp reduce all values in every value to a single value + for (int offset = kWarpSize / 2; offset > 0; offset /= 2) { + float other_dy_sum = __shfl_down_sync(0xffffffff, dy_sum, offset); + float other_dot_p = __shfl_down_sync(0xffffffff, dot_p, offset); + dy_sum += other_dy_sum; + dot_p += other_dot_p; + } + __syncwarp(); + if (laneId == 0) { + shared_dy[warpId] = dy_sum; + shared_dot_p[warpId] = dot_p; + // one value per warp now + } + __syncthreads(); + if (warpId == 0) { + dy_sum = shared_dy[laneId]; + dot_p = shared_dot_p[laneId]; + __syncwarp(); + for (int offset = kWarpSize / 2; offset > 0; offset /= 2) { + float other_dy = __shfl_down_sync(0xffffffff, dy_sum, offset); + float other_dot_p = __shfl_down_sync(0xffffffff, dot_p, offset); + dy_sum += other_dy; + dot_p += other_dot_p; + } + __syncwarp(); + } + if (threadIdx.x == 0) { + dy_sum_local[plane] = dy_sum; + dot_p_local[plane] = dot_p; + } + return; +} + +template +__global__ void SyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx, + G *saved_mean, G *saved_invstd, float *dy_sum_red, float *dot_p_red, S *scale, + S *dscale, S *dbias, float epsilon) { + int size = N * C * H * W; + int plane_size = N * H * W; + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + int block_num = (pos / W) / H; // which of N * C blocks + int plane = block_num % C; + float mean = HalfFloatInputConvert(saved_mean[plane]); + float invstd = HalfFloatInputConvert(saved_invstd[plane]); + float scale_value = HalfFloatInputConvert(scale[plane]); + float div_factor = HalfFloatInputConvert(1) / plane_size; + float dy_sum_plane = 
dy_sum_red[plane]; + float dot_p_plane = dot_p_red[plane]; + float grad_mean = dy_sum_plane * div_factor; + float proj_scale = dot_p_plane * div_factor * invstd * invstd; + float grad_scale = invstd * scale_value; + float inp = HalfFloatInputConvert(x_input[pos]); + float proj = (inp - mean) * proj_scale; + HalfFloatOutputAssign((HalfFloatInputConvert(dy[pos]) - proj - grad_mean) * grad_scale, dx, pos); + } +} + +template +__global__ void SyncBatchNormGradPostScaleBias(size_t C, G *saved_invstd, float *dy_sum_red, float *dot_p_red, + S *dscale, S *dbias) { + for (size_t plane = blockIdx.x * blockDim.x + threadIdx.x; plane < C; plane += blockDim.x * gridDim.x) { + float invstd = HalfFloatInputConvert(saved_invstd[plane]); + float dy_sum_plane = dy_sum_red[plane]; + float dot_p_plane = dot_p_red[plane]; + dscale[plane] = static_cast(dot_p_plane * invstd); + dbias[plane] = static_cast(dy_sum_plane); + } +} + +template +void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, G *saved_mean, + G *saved_invstd, float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream) { + SyncBatchNormGradPre<<>>(N, C, H, W, x_input, dy, saved_mean, saved_invstd, + dy_sum_local, dot_p_local); + return; +} +template +void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx, + G *saved_mean, G *saved_invstd, float *dy_sum_red, float *dot_p_red, S *scale, S *dscale, + S *dbias, float epsilon, cudaStream_t cuda_stream) { + SyncBatchNormGradPost<<>>(N, C, H, W, x_input, dy, dx, saved_mean, saved_invstd, + dy_sum_red, dot_p_red, scale, dscale, dbias, epsilon); + SyncBatchNormGradPostScaleBias<<(GET_THREADS)), 0, cuda_stream>>>( + C, saved_invstd, dy_sum_red, dot_p_red, dscale, dbias); +} +// PRE FUNCTION +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, + const float *x_input, const float *dy, + float *saved_mean, float *saved_invstd, + float 
*dy_sum_local, float *dot_p_local, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, + const float *x_input, const float *dy, + half *saved_mean, half *saved_invstd, + float *dy_sum_local, float *dot_p_local, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, + const half *x_input, const half *dy, + float *saved_mean, float *saved_invstd, + float *dy_sum_local, float *dot_p_local, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, + const half *x_input, const half *dy, half *saved_mean, + half *saved_invstd, float *dy_sum_local, + float *dot_p_local, cudaStream_t cuda_stream); +// POST FUNCTION +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, + const float *x_input, const float *dy, + float *dx, float *saved_mean, + float *saved_invstd, float *dy_sum_red, + float *dot_p_red, float *scale, + float *dscale, float *dbias, float epsilon, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, + const half *x_input, const half *dy, + half *dx, float *saved_mean, + float *saved_invstd, float *dy_sum_red, + float *dot_p_red, float *scale, + float *dscale, float *dbias, float epsilon, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, + const float *x_input, const float *dy, + float *dx, float *saved_mean, + float *saved_invstd, float *dy_sum_red, + float *dot_p_red, half *scale, half *dscale, + half *dbias, float epsilon, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, + const half *x_input, const half *dy, half *dx, + float *saved_mean, float *saved_invstd, + float *dy_sum_red, float 
*dot_p_red, + half *scale, half *dscale, half *dbias, + float epsilon, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, + const float *x_input, const float *dy, + float *dx, half *saved_mean, + half *saved_invstd, float *dy_sum_red, + float *dot_p_red, float *scale, + float *dscale, float *dbias, float epsilon, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, + const half *x_input, const half *dy, half *dx, + half *saved_mean, half *saved_invstd, + float *dy_sum_red, float *dot_p_red, + float *scale, float *dscale, float *dbias, + float epsilon, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, + const float *x_input, const float *dy, + float *dx, half *saved_mean, + half *saved_invstd, float *dy_sum_red, + float *dot_p_red, half *scale, half *dscale, + half *dbias, float epsilon, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, + const half *x_input, const half *dy, half *dx, + half *saved_mean, half *saved_invstd, + float *dy_sum_red, float *dot_p_red, + half *scale, half *dscale, half *dbias, + float epsilon, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh new file mode 100644 index 00000000000..efaa72e91b9 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh @@ -0,0 +1,29 @@ +// /** +// * Copyright 2021 Huawei Technologies Co., Ltd +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. 
+// * You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, + G *saved_mean, G *invstd_saved, float *dy_sum_local, float *dot_p_local, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, + T *dx, G *saved_mean, G *invstd_saved, float *dy_sum_red, + float *dot_p_red, S *scale, S *dscale, S *dbias, float epsilon, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cu similarity index 57% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cu index e4126b8bf8c..52e1d11f735 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cu @@ -15,9 +15,8 @@ */ #include -#include 
"plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh" const int kWarpSize = 32; const int kNumWarps = 32; @@ -199,50 +198,64 @@ void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const T *input return; } -template void CalSyncBatchNormPre(size_t N, size_t C, size_t H, size_t W, const float *input, int *output_n, - float *output_mean, float *output_var, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormPre(size_t N, size_t C, size_t H, size_t W, const half *input, int *output_n, - float *output_mean, float *output_var, float epsilon, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormPre(size_t N, size_t C, size_t H, size_t W, const float *input, + int *output_n, float *output_mean, float *output_var, + float epsilon, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormPre(size_t N, size_t C, size_t H, size_t W, const half *input, + int *output_n, float *output_mean, float *output_var, + float epsilon, cudaStream_t cuda_stream); -template void CalSyncBatchNormGather(size_t N_, size_t C_, size_t H_, size_t W_, int *counts_global, - float *means_global, float *invstds_global, int *counts_local, - float *means_local, float *invstds_local, float *running_mean_output, - float *running_var_output, float *running_mean_input, - float *running_var_input, float epsilon, float momentum, - size_t group_rank, size_t group_size, cudaStream_t cuda_stream); -template void CalSyncBatchNormGather(size_t N_, size_t C_, size_t H_, size_t W_, int *counts_global, - float *means_global, float *invstds_global, int *counts_local, - float *means_local, float *invstds_local, float *running_mean_output, - float *running_var_output, half 
*running_mean_input, - half *running_var_input, float epsilon, float momentum, - size_t group_rank, size_t group_size, cudaStream_t cuda_stream); -template void CalSyncBatchNormGather(size_t N_, size_t C_, size_t H_, size_t W_, int *counts_global, - float *means_global, float *invstds_global, int *counts_local, - float *means_local, float *invstds_local, half *running_mean_output, - half *running_var_output, float *running_mean_input, - float *running_var_input, float epsilon, float momentum, - size_t group_rank, size_t group_size, cudaStream_t cuda_stream); -template void CalSyncBatchNormGather(size_t N_, size_t C_, size_t H_, size_t W_, int *counts_global, - float *means_global, float *invstds_global, int *counts_local, - float *means_local, float *invstds_local, half *running_mean_output, - half *running_var_output, half *running_mean_input, - half *running_var_input, float epsilon, float momentum, - size_t group_rank, size_t group_size, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGather(size_t N_, size_t C_, size_t H_, size_t W_, + int *counts_global, float *means_global, + float *invstds_global, int *counts_local, + float *means_local, float *invstds_local, + float *running_mean_output, + float *running_var_output, float *running_mean_input, + float *running_var_input, float epsilon, + float momentum, size_t group_rank, size_t group_size, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGather(size_t N_, size_t C_, size_t H_, size_t W_, + int *counts_global, float *means_global, + float *invstds_global, int *counts_local, + float *means_local, float *invstds_local, + float *running_mean_output, float *running_var_output, + half *running_mean_input, half *running_var_input, + float epsilon, float momentum, size_t group_rank, + size_t group_size, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGather(size_t N_, size_t C_, size_t H_, size_t W_, + int *counts_global, float 
*means_global, + float *invstds_global, int *counts_local, + float *means_local, float *invstds_local, + half *running_mean_output, half *running_var_output, + float *running_mean_input, float *running_var_input, + float epsilon, float momentum, size_t group_rank, + size_t group_size, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormGather(size_t N_, size_t C_, size_t H_, size_t W_, + int *counts_global, float *means_global, + float *invstds_global, int *counts_local, + float *means_local, float *invstds_local, + half *running_mean_output, half *running_var_output, + half *running_mean_input, half *running_var_input, + float epsilon, float momentum, size_t group_rank, + size_t group_size, cudaStream_t cuda_stream); -template void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const float *input, - float *output, float *means_local, float *invstds_local, float *scale, - float *bias, float *output_scale, float *output_bias, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const float *input, - float *output, float *means_local, float *invstds_local, half *scale, - half *bias, half *output_scale, half *output_bias, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const half *input, half *output, - float *means_local, float *invstds_local, float *scale, float *bias, - float *output_scale, float *output_bias, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const half *input, half *output, - float *means_local, float *invstds_local, half *scale, half *bias, - half *output_scale, half *output_bias, float epsilon, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, + const float *input, float *output, float *means_local, + float *invstds_local, float 
*scale, float *bias, + float *output_scale, float *output_bias, float epsilon, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, + const float *input, float *output, float *means_local, + float *invstds_local, half *scale, half *bias, + half *output_scale, half *output_bias, float epsilon, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, + const half *input, half *output, float *means_local, + float *invstds_local, float *scale, float *bias, + float *output_scale, float *output_bias, float epsilon, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, + const half *input, half *output, float *means_local, + float *invstds_local, half *scale, half *bias, + half *output_scale, half *output_bias, float epsilon, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh new file mode 100644 index 00000000000..4bcf420dbfc --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh @@ -0,0 +1,35 @@ +// /** +// * Copyright 2021 Huawei Technologies Co., Ltd +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. 
+// */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void CalSyncBatchNormPre(size_t N, size_t C, size_t H, size_t W, const T *input, int *output_n, + float *means_local, float *invstds_local, float epsilon, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalSyncBatchNormGather(size_t N, size_t C, size_t H, size_t W, int *counts_global, + float *means_global, float *invstds_global, int *counts_local, + float *means_local, float *invstds_local, T *running_mean_output, + T *running_var_output, G *running_mean_input, G *running_var_input, + float epsilon, float momentum, size_t group_rank, size_t group_size, + cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const T *input, T *output, + float *means_local, float *invstds_local, S *scale, S *bias, S *output_scale, + S *output_bias, float epsilon, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SYNC_BATCH_NORM_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cu new file mode 100644 index 00000000000..d5c89fc3c08 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cu @@ -0,0 +1,95 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" + +template +__global__ void TensorScatterAddKernel(T *input, S *indices, T *update, T *output, const size_t block_size, + const size_t input_size, const size_t output_size, const size_t indices_dim_0, + const size_t indices_dim_1, S *indices_stride, S *work_shape) { + int i, j; + for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; + read_index += blockDim.x * gridDim.x) { + size_t write_index = 0; + bool out_bound = false; + + i = read_index / block_size; + j = read_index % block_size; + + for (size_t k = 0; k < indices_dim_1; k++) { + S indices_i = indices[i * indices_dim_1 + k]; + out_bound |= indices_i >= work_shape[k]; + write_index += indices_i * indices_stride[k]; + } + + write_index += j; + out_bound |= write_index >= output_size; + + if (!out_bound) { + MsAtomicAdd(&output[write_index], update[read_index]); + } + } +} + +template +void TensorScatterAdd(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, + S *indices_stride, S *work_shape, cudaStream_t stream) { + TensorScatterAddKernel<<>>( + input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride, + work_shape); + return; +} + +template CUDA_LIB_EXPORT void TensorScatterAdd(half *input, int *indices, half *update, half 
*output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterAdd(float *input, int *indices, float *update, float *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterAdd(double *input, int *indices, double *update, double *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterAdd(char *input, int *indices, char *update, char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterAdd(unsigned char *input, int *indices, + unsigned char *update, unsigned char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterAdd(int *input, int *indices, int *update, int *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterAdd(double *input, int64_t *indices, double *update, + double *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const 
size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh new file mode 100644 index 00000000000..25c84cabe52 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_ADD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_ADD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void TensorScatterAdd(T *input, S *indices, T *update, T *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, S *indices_stride, S *work_shape, + cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_ADD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cu new file mode 100644 index 00000000000..fe2e52bcb68 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cu @@ -0,0 +1,125 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" + +template +__global__ void TensorScatterMaxKernel(T *input, S *indices, T *update, T *output, const size_t block_size, + const size_t input_size, const size_t output_size, const size_t indices_dim_0, + const size_t indices_dim_1, S *indices_stride, S *work_shape) { + int i, j; + for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; + read_index += blockDim.x * gridDim.x) { + size_t write_index = 0; + bool out_bound = false; + + i = read_index / block_size; + j = read_index % block_size; + + for (size_t k = 0; k < indices_dim_1; k++) { + S indices_i = indices[i * indices_dim_1 + k]; + out_bound |= indices_i >= work_shape[k]; + write_index += indices_i * indices_stride[k]; + } + + write_index += j; + out_bound |= write_index >= output_size; + + if (!out_bound) { + MsAtomicMax(&output[write_index], update[read_index]); + } + } +} + +template +void TensorScatterMax(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, + S *indices_stride, S *work_shape, cudaStream_t stream) { + TensorScatterMaxKernel<<>>( + input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride, + work_shape); + return; +} + +// for int32 index +template CUDA_LIB_EXPORT void TensorScatterMax(half *input, int *indices, half *update, half *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMax(float *input, int *indices, float *update, float *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, 
const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMax(char *input, int *indices, char *update, char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMax(unsigned char *input, int *indices, + unsigned char *update, unsigned char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMax(int *input, int *indices, int *update, int *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +// for int64 index +template CUDA_LIB_EXPORT void TensorScatterMax(half *input, int64_t *indices, half *update, half *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMax(float *input, int64_t *indices, float *update, + float *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, const size_t &indices_dim_1, + int64_t *indices_stride, int64_t *work_shape, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMax(char *input, int64_t *indices, char *update, char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t 
&indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMax(unsigned char *input, int64_t *indices, + unsigned char *update, unsigned char *output, + const size_t &block_size, + const size_t &input_size, + const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, + int64_t *indices_stride, int64_t *work_shape, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMax(int *input, int64_t *indices, int *update, int *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh new file mode 100644 index 00000000000..f8cff09de25 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MAX_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MAX_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void TensorScatterMax(T *input, S *indices, T *update, T *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, S *indices_stride, S *work_shape, + cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MAX_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cu new file mode 100644 index 00000000000..222119ba3e9 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cu @@ -0,0 +1,125 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" + +template +__global__ void TensorScatterMinKernel(T *input, S *indices, T *update, T *output, const size_t block_size, + const size_t input_size, const size_t output_size, const size_t indices_dim_0, + const size_t indices_dim_1, S *indices_stride, S *work_shape) { + int i, j; + for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; + read_index += blockDim.x * gridDim.x) { + size_t write_index = 0; + bool out_bound = false; + + i = read_index / block_size; + j = read_index % block_size; + + for (size_t k = 0; k < indices_dim_1; k++) { + S indices_i = indices[i * indices_dim_1 + k]; + out_bound |= indices_i >= work_shape[k]; + write_index += indices_i * indices_stride[k]; + } + + write_index += j; + out_bound |= write_index >= output_size; + + if (!out_bound) { + MsAtomicMin(&output[write_index], update[read_index]); + } + } +} + +template +void TensorScatterMin(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, + S *indices_stride, S *work_shape, cudaStream_t stream) { + TensorScatterMinKernel<<>>( + input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride, + work_shape); + return; +} + +// for int32 index +template CUDA_LIB_EXPORT void TensorScatterMin(half *input, int *indices, half *update, half *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMin(float *input, int *indices, float *update, float *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, 
const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMin(char *input, int *indices, char *update, char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMin(unsigned char *input, int *indices, + unsigned char *update, unsigned char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMin(int *input, int *indices, int *update, int *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +// for int64 index +template CUDA_LIB_EXPORT void TensorScatterMin(half *input, int64_t *indices, half *update, half *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMin(float *input, int64_t *indices, float *update, + float *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, const size_t &indices_dim_1, + int64_t *indices_stride, int64_t *work_shape, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMin(char *input, int64_t *indices, char *update, char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t 
&indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMin(unsigned char *input, int64_t *indices, + unsigned char *update, unsigned char *output, + const size_t &block_size, + const size_t &input_size, + const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, + int64_t *indices_stride, int64_t *work_shape, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterMin(int *input, int64_t *indices, int *update, int *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh new file mode 100644 index 00000000000..0cafc15e10a --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MIN_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MIN_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void TensorScatterMin(T *input, S *indices, T *update, T *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, S *indices_stride, S *work_shape, + cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_MIN_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cu new file mode 100644 index 00000000000..3d78b6db76e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cu @@ -0,0 +1,125 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" + +template +__global__ void TensorScatterSubKernel(T *input, S *indices, T *update, T *output, const size_t block_size, + const size_t input_size, const size_t output_size, const size_t indices_dim_0, + const size_t indices_dim_1, S *indices_stride, S *work_shape) { + int i, j; + for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; + read_index += blockDim.x * gridDim.x) { + size_t write_index = 0; + bool out_bound = false; + + i = read_index / block_size; + j = read_index % block_size; + + for (size_t k = 0; k < indices_dim_1; k++) { + S indices_i = indices[i * indices_dim_1 + k]; + out_bound |= indices_i >= work_shape[k]; + write_index += indices_i * indices_stride[k]; + } + + write_index += j; + out_bound |= write_index >= output_size; + + if (!out_bound) { + MsAtomicSub(&output[write_index], update[read_index]); + } + } +} + +template +void TensorScatterSub(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, + S *indices_stride, S *work_shape, cudaStream_t stream) { + TensorScatterSubKernel<<>>( + input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride, + work_shape); + return; +} + +// for int32 index +template CUDA_LIB_EXPORT void TensorScatterSub(half *input, int *indices, half *update, half *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterSub(float *input, int *indices, float *update, float *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, 
const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterSub(char *input, int *indices, char *update, char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterSub(unsigned char *input, int *indices, + unsigned char *update, unsigned char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterSub(int *input, int *indices, int *update, int *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); + +// for int64 index +template CUDA_LIB_EXPORT void TensorScatterSub(half *input, int64_t *indices, half *update, half *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterSub(float *input, int64_t *indices, float *update, + float *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, const size_t &indices_dim_1, + int64_t *indices_stride, int64_t *work_shape, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterSub(char *input, int64_t *indices, char *update, char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t 
&indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterSub(unsigned char *input, int64_t *indices, + unsigned char *update, unsigned char *output, + const size_t &block_size, + const size_t &input_size, + const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, + int64_t *indices_stride, int64_t *work_shape, + cudaStream_t stream); + +template CUDA_LIB_EXPORT void TensorScatterSub(int *input, int64_t *indices, int *update, int *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh new file mode 100644 index 00000000000..82d85071398 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_SUB_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_SUB_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void TensorScatterSub(T *input, S *indices, T *update, T *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, S *indices_stride, S *work_shape, + cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_SUB_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cu new file mode 100644 index 00000000000..78f40034b37 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cu @@ -0,0 +1,114 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" + +template +__global__ void TensorScatterUpdateKernel(T *input, S *indices, T *update, T *output, const size_t block_size, + const size_t input_size, const size_t output_size, const size_t indices_dim_0, + const size_t indices_dim_1, S *indices_stride, S *work_shape) { + int i, j; + for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; + read_index += blockDim.x * gridDim.x) { + size_t write_index = 0; + bool out_bound = false; + + i = read_index / block_size; + j = read_index % block_size; + + for (size_t k = 0; k < indices_dim_1; k++) { + S indices_i = indices[i * indices_dim_1 + k]; + out_bound |= indices_i >= work_shape[k]; + write_index += indices_i * indices_stride[k]; + } + + write_index += j; + out_bound |= write_index >= output_size; + + if (!out_bound) { + output[write_index] = update[read_index]; + } + } +} + +template +void TensorScatterUpdate(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, + S *indices_stride, S *work_shape, cudaStream_t stream) { + TensorScatterUpdateKernel<<>>( + input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride, + work_shape); + return; +} + +template CUDA_LIB_EXPORT void TensorScatterUpdate(half *input, int *indices, half *update, half *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterUpdate(float *input, int *indices, float *update, float *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t 
&indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterUpdate(double *input, int *indices, double *update, + double *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, const size_t &indices_dim_1, + int *indices_stride, int *work_shape, + cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterUpdate(char *input, int *indices, char *update, char *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterUpdate(unsigned char *input, int *indices, + unsigned char *update, unsigned char *output, + const size_t &block_size, + const size_t &input_size, + const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterUpdate(int *input, int *indices, int *update, int *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterUpdate(bool *input, int *indices, bool *update, bool *output, + const size_t &block_size, const size_t &input_size, + const size_t &output_size, const size_t &indices_dim_0, + const size_t &indices_dim_1, int *indices_stride, + int *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterUpdate(bool *input, int64_t *indices, bool *update, + bool *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t 
*work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterUpdate(float *input, int64_t *indices, float *update, + float *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); +template CUDA_LIB_EXPORT void TensorScatterUpdate(double *input, int64_t *indices, double *update, + double *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, + const size_t &indices_dim_1, int64_t *indices_stride, + int64_t *work_shape, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh new file mode 100644 index 00000000000..4c216307d30 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_UPDATE_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_UPDATE_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void TensorScatterUpdate(T *input, S *indices, T *update, T *output, const size_t &block_size, + const size_t &input_size, const size_t &output_size, + const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride, + S *work_shape, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TENSOR_SCATTER_UPDATE_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cu new file mode 100644 index 00000000000..25c84ca0027 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cu @@ -0,0 +1,81 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh" +#include "include/cuda_fp16.h" + +template +__global__ void Tile(const size_t output_size, const size_t input_size, const size_t shape_size, + const size_t *input_shape, const size_t *output_shape, const T *input, T *output) { + // for example 4-D: pos = pos_array[0] * output_shape[1] * output_shape[2] * output_shape[3] + + // pos_array[1] * output_shape[2] * output_shape[3] + + // pos_array[2] * output_shape[3] + + // pos_array[3] + size_t pos_array[TILE_MAX_DIMENSION]; + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_size; pos += blockDim.x * gridDim.x) { + size_t tmp_pos = pos; + size_t pos_size = output_size / output_shape[0]; + pos_array[0] = tmp_pos / pos_size; + for (size_t i = 1; i < shape_size; i++) { + tmp_pos -= pos_array[i - 1] * pos_size; + pos_size = pos_size / output_shape[i]; + pos_array[i] = tmp_pos / pos_size; + } + for (size_t i = 0; i < shape_size; i++) { + pos_array[i] = pos_array[i] % input_shape[i]; + } + pos_size = input_size; + size_t input_pos = 0; + for (size_t i = 0; i < shape_size; i++) { + pos_size /= input_shape[i]; + input_pos += (pos_array[i] * pos_size); + } + output[pos] = input[input_pos]; + } +} + +template +void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, const size_t *input_shape, + const size_t *output_shape, const T *input, T *output, cudaStream_t cuda_stream) { + Tile<<>>(output_size, input_size, shape_size, input_shape, + output_shape, input, output); + return; +} + +template CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, + const size_t shape_size, const size_t *input_shape, + const size_t *output_shape, const double *input, double *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, + const size_t *input_shape, const size_t *output_shape, const float 
*input, + float *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, + const size_t *input_shape, const size_t *output_shape, const half *input, + half *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, + const size_t shape_size, const size_t *input_shape, + const size_t *output_shape, const int16_t *input, int16_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, + const size_t *input_shape, const size_t *output_shape, const int *input, + int *output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, + const size_t shape_size, const size_t *input_shape, + const size_t *output_shape, const int64_t *input, int64_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, + const size_t *input_shape, const size_t *output_shape, const bool *input, + bool *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh index 3709254aaf5..6816f0e21d4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh @@ -14,14 +14,14 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_APPLY_GRADIENT_DESCENT_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_APPLY_GRADIENT_DESCENT_IMPL_CUH_ - -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TILE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TILE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#define TILE_MAX_DIMENSION 100 template -void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T *delta, T *output, +CUDA_LIB_EXPORT void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, + const size_t *input_shape, const size_t *output_shape, const T *input, T *output, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_APPLY_GRADIENT_DESCENT_IMPL_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TILE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cu similarity index 93% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cu index 6e21b3918fb..e56af86ba13 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/topk_lib.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_lib.cuh" #include #include +#include "include/cuda_fp16.h" const int kMaxQueue = 128; @@ -221,7 +222,7 @@ void FastTopK(const int outer_size, const int inner_size, const T *input, S k_cu } } -template void FastTopK(const int outer_size, const int inner_size, const half *input, int k_cut, half *output, - int *output_index, const half init_K, cudaStream_t stream); -template void FastTopK(const int outer_size, const int inner_size, const float *input, int k_cut, float *output, - int *output_index, const float init_K, cudaStream_t stream); +template CUDA_LIB_EXPORT void FastTopK(const int outer_size, const int inner_size, const half *input, int k_cut, + half *output, int *output_index, const half init_K, cudaStream_t stream); +template CUDA_LIB_EXPORT void FastTopK(const int outer_size, const int inner_size, const float *input, int k_cut, + float *output, int *output_index, const float init_K, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh similarity index 57% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh index a1ca24d09cb..894beb8196b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh @@ -14,14 +14,13 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TOPK_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TOPK_IMPL_CUH_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TOPK_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TOPK_IMPL_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void FastTopK(const int outer, const int inner, const T *input_addr, S k_cut, T *output, S *indices, const T initK, - cudaStream_t stream); +CUDA_LIB_EXPORT void FastTopK(const int outer, const int inner, const T *input_addr, S k_cut, T *output, S *indices, + const T initK, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TOPK_IMPL_CUH_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TOPK_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_lib.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_lib.cuh similarity index 100% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/topk_lib.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_lib.cuh diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cu new file mode 100755 index 00000000000..a2de4534940 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cu @@ -0,0 +1,87 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "include/cuda_fp16.h" +#include "transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" + +template +using Complex = mindspore::utils::Complex; + +template +__global__ void Transpose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis, + const size_t shape_size, T *output) { + size_t pos_size; + size_t temp_pos; + size_t newpos; + size_t newpos_size; + size_t pos_array[TRANSPOSE_MAX_DIMENSION]; + + // for example 4-D: pos = posArray[0] * input_shape[1] * input_shape[2] * input_shape[3] + + // posArray[1] * input_shape[2] * input_shape[3] + + // posArray[2] * input_shape[3] + + // posArray[3] + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + temp_pos = pos; + pos_size = size / input_shape[0]; + pos_array[0] = temp_pos / pos_size; + for (size_t i = 1; i < shape_size; i++) { + temp_pos -= pos_array[i - 1] * pos_size; + pos_size = pos_size / input_shape[i]; + pos_array[i] = temp_pos / pos_size; + } + + newpos = pos_array[input_axis[shape_size - 1]]; + newpos_size = 1; + for (int64_t j = shape_size - 2; j >= 0; j--) { + newpos_size *= input_shape[input_axis[j + 1]]; + newpos += pos_array[input_axis[j]] * newpos_size; + } + + output[newpos] = input[pos]; + } +} +template +void CalTranspose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis, + const size_t shape_size, T *output, cudaStream_t cuda_stream) { + Transpose<<>>(size, input, input_shape, input_axis, shape_size, + 
output); +} + +template CUDA_LIB_EXPORT void CalTranspose(const size_t size, const double *input, const size_t *input_shape, + const size_t *input_axis, const size_t shape_size, double *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTranspose(const size_t size, const float *input, const size_t *input_shape, + const size_t *input_axis, const size_t shape_size, float *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTranspose(const size_t size, const half *input, const size_t *input_shape, + const size_t *input_axis, const size_t shape_size, half *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTranspose(const size_t size, const int *input, const size_t *input_shape, + const size_t *input_axis, const size_t shape_size, int *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTranspose(const size_t size, const int64_t *input, const size_t *input_shape, + const size_t *input_axis, const size_t shape_size, int64_t *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTranspose>(const size_t size, const Complex *input, + const size_t *input_shape, const size_t *input_axis, + const size_t shape_size, Complex *output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalTranspose>(const size_t size, const Complex *input, + const size_t *input_shape, const size_t *input_axis, + const size_t shape_size, Complex *output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh index c1aec6e0486..8c1d9360229 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh @@ -14,12 +14,13 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #define TRANSPOSE_MAX_DIMENSION 100 template -void CalTranspose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis, - const size_t shape_size, T *output, cudaStream_t cuda_stream); +CUDA_LIB_EXPORT void CalTranspose(const size_t size, const T *input, const size_t *input_shape, + const size_t *input_axis, const size_t shape_size, T *output, + cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cu similarity index 71% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cu index fd0b1b6203a..2c303bcbc38 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cu @@ -21,7 +21,7 @@ #include #include #include "transpose_impl_opt.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "include/cuda_fp16.h" // Optimize nchw2nhwc && nhwc2nchw with tiling and shared memory. // Firstly, combined 2 dims hw together, treat input and output as 3D tensor. 
@@ -255,44 +255,52 @@ void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const T * d_output, cuda_stream); } -template void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const double *d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, double *d_output, - cudaStream_t cuda_stream); -template void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const float *d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, float *d_output, - cudaStream_t cuda_stream); -template void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const half *d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, half *d_output, - cudaStream_t cuda_stream); -template void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const int *d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, int *d_output, - cudaStream_t cuda_stream); -template void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const int64_t *d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, int64_t *d_output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, + const double *d_input, const size_t *input_shape, + const size_t *input_axis, const size_t *d_input_shape, + const size_t *d_input_axis, double *d_output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, + const float *d_input, const size_t *input_shape, + const size_t *input_axis, const size_t *d_input_shape, + const size_t *d_input_axis, float *d_output, + cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, + const half *d_input, const size_t *input_shape, + const size_t *input_axis, const size_t *d_input_shape, + const size_t *d_input_axis, half *d_output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const int *d_input, + const size_t *input_shape, const size_t *input_axis, + const size_t *d_input_shape, const size_t *d_input_axis, + int *d_output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, + const int64_t *d_input, const size_t *input_shape, + const size_t *input_axis, const size_t *d_input_shape, + const size_t *d_input_axis, int64_t *d_output, + cudaStream_t cuda_stream); -template void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const double *d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, double *d_output, - cudaStream_t cuda_stream); -template void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const float *d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, float *d_output, - cudaStream_t cuda_stream); -template void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const half *d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, half *d_output, - cudaStream_t cuda_stream); -template void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const int *d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, int *d_output, - cudaStream_t cuda_stream); -template void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const int64_t 
*d_input, - const size_t *input_shape, const size_t *input_axis, - const size_t *d_input_shape, const size_t *d_input_axis, int64_t *d_output, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, + const double *d_input, const size_t *input_shape, + const size_t *input_axis, const size_t *d_input_shape, + const size_t *d_input_axis, double *d_output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, + const float *d_input, const size_t *input_shape, + const size_t *input_axis, const size_t *d_input_shape, + const size_t *d_input_axis, float *d_output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, + const half *d_input, const size_t *input_shape, + const size_t *input_axis, const size_t *d_input_shape, + const size_t *d_input_axis, half *d_output, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const int *d_input, + const size_t *input_shape, const size_t *input_axis, + const size_t *d_input_shape, const size_t *d_input_axis, + int *d_output, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, + const int64_t *d_input, const size_t *input_shape, + const size_t *input_axis, const size_t *d_input_shape, + const size_t *d_input_axis, int64_t *d_output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh new file mode 100644 index 00000000000..87efa082f45 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh @@ -0,0 +1,34 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_OPT_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_OPT_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#define TRANSPOSE_MAX_DIMENSION 100 +template +CUDA_LIB_EXPORT void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const T *d_input, + const size_t *input_shape, const size_t *input_axis, + const size_t *d_input_shape, const size_t *d_input_axis, T *output, + cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const T *d_input, + const size_t *input_shape, const size_t *input_axis, + const size_t *d_input_shape, const size_t *d_input_axis, T *output, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRANSPOSE_IMPL_OPT_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cu similarity index 64% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cu index 593fddfbaff..a36cfb788ab 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cu @@ -15,6 +15,7 @@ */ #include "triangle_matrix_copy_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void TriangleMatrixCopyKernel(const T *input, T *output, bool clean, cublasFillMode_t uplo, const size_t count, const size_t ldb, const size_t m) { @@ -57,14 +58,16 @@ void TriangleMatrixCopy(const T *input, T *output, bool clean, cublasFillMode_t return; } -template void TriangleMatrixCopy(const float *input, float *output, bool clean, cublasFillMode_t uplo, - const size_t count, const size_t ldb, const size_t m, cudaStream_t cuda_stream); -template void TriangleMatrixCopy(const half *input, half *output, bool clean, cublasFillMode_t uplo, - const size_t count, const size_t ldb, const size_t m, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void TriangleMatrixCopy(const float *input, float *output, bool clean, + cublasFillMode_t uplo, const size_t count, const size_t ldb, + const size_t m, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void TriangleMatrixCopy(const half *input, half *output, bool clean, + cublasFillMode_t uplo, const size_t count, const size_t ldb, + const size_t m, cudaStream_t cuda_stream); -template void TriangleMatrixCopy(const double *input, double *output, bool clean, cublasFillMode_t uplo, - const size_t count, const size_t ldb, const size_t m, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void TriangleMatrixCopy(const double *input, double *output, bool clean, + cublasFillMode_t uplo, const size_t count, const size_t ldb, + const size_t m, cudaStream_t cuda_stream); template void MatrixCopy(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { @@ -72,6 +75,9 @@ void MatrixCopy(const T *input, T *output, const size_t count, cudaStream_t cuda return; } -template void MatrixCopy(const float *input, float *output, 
const size_t count, cudaStream_t cuda_stream); -template void MatrixCopy(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void MatrixCopy(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixCopy(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixCopy(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void MatrixCopy(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh new file mode 100644 index 00000000000..838bbc0a90d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRIANGLE_MATRIX_COPY_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRIANGLE_MATRIX_COPY_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void TriangleMatrixCopy(const T *input, T *output, bool clean, cublasFillMode_t uplo, + const size_t count, const size_t ldb, const size_t m, cudaStream_t cuda_stream); + +template +CUDA_LIB_EXPORT void MatrixCopy(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_TRIANGLE_MATRIX_COPY_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cu similarity index 57% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cu index 4beb1e0c58a..7e51f0f9b19 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cu @@ -15,6 +15,7 @@ */ #include "unary_op_grad_impl.cuh" +#include "include/cuda_fp16.h" template __global__ void SqrtGradKernel(const T *input, const T *dout, T *output, const size_t count) { @@ -170,52 +171,52 @@ void ReciprocalGrad(const T *input, const T *dout, T *output, const size_t count return; } -template void SqrtGrad(const double *input, const double *dout, double *output, const size_t count, - cudaStream_t cuda_stream); -template void RsqrtGrad(const double *input, const double *dout, double *output, const size_t count, - cudaStream_t cuda_stream); -template void AsinGrad(const double *input, const double *dout, double *output, const size_t count, - cudaStream_t cuda_stream); -template void ACosGrad(const double 
*input, const double *dout, double *output, const size_t count, - cudaStream_t cuda_stream); -template void AtanGrad(const double *input, const double *dout, double *output, const size_t count, - cudaStream_t cuda_stream); -template void AsinhGrad(const double *input, const double *dout, double *output, const size_t count, - cudaStream_t cuda_stream); -template void AcoshGrad(const double *input, const double *dout, double *output, const size_t count, - cudaStream_t cuda_stream); -template void ReciprocalGrad(const double *input, const double *dout, double *output, const size_t count, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SqrtGrad(const double *input, const double *dout, double *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void RsqrtGrad(const double *input, const double *dout, double *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AsinGrad(const double *input, const double *dout, double *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ACosGrad(const double *input, const double *dout, double *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AtanGrad(const double *input, const double *dout, double *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AsinhGrad(const double *input, const double *dout, double *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AcoshGrad(const double *input, const double *dout, double *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReciprocalGrad(const double *input, const double *dout, double *output, + const size_t count, cudaStream_t cuda_stream); -template void SqrtGrad(const float *input, const float *dout, float *output, const size_t count, - cudaStream_t cuda_stream); -template void RsqrtGrad(const float *input, const float *dout, float 
*output, const size_t count, - cudaStream_t cuda_stream); -template void AsinGrad(const float *input, const float *dout, float *output, const size_t count, - cudaStream_t cuda_stream); -template void ACosGrad(const float *input, const float *dout, float *output, const size_t count, - cudaStream_t cuda_stream); -template void AtanGrad(const float *input, const float *dout, float *output, const size_t count, - cudaStream_t cuda_stream); -template void AsinhGrad(const float *input, const float *dout, float *output, const size_t count, - cudaStream_t cuda_stream); -template void AcoshGrad(const float *input, const float *dout, float *output, const size_t count, - cudaStream_t cuda_stream); -template void ReciprocalGrad(const float *input, const float *dout, float *output, const size_t count, - cudaStream_t cuda_stream); -template void SqrtGrad(const half *input, const half *dout, half *output, const size_t count, - cudaStream_t cuda_stream); -template void RsqrtGrad(const half *input, const half *dout, half *output, const size_t count, - cudaStream_t cuda_stream); -template void AsinGrad(const half *input, const half *dout, half *output, const size_t count, - cudaStream_t cuda_stream); -template void ACosGrad(const half *input, const half *dout, half *output, const size_t count, - cudaStream_t cuda_stream); -template void AtanGrad(const half *input, const half *dout, half *output, const size_t count, - cudaStream_t cuda_stream); -template void AsinhGrad(const half *input, const half *dout, half *output, const size_t count, - cudaStream_t cuda_stream); -template void AcoshGrad(const half *input, const half *dout, half *output, const size_t count, - cudaStream_t cuda_stream); -template void ReciprocalGrad(const half *input, const half *dout, half *output, const size_t count, - cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SqrtGrad(const float *input, const float *dout, float *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT 
void RsqrtGrad(const float *input, const float *dout, float *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AsinGrad(const float *input, const float *dout, float *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ACosGrad(const float *input, const float *dout, float *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AtanGrad(const float *input, const float *dout, float *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AsinhGrad(const float *input, const float *dout, float *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AcoshGrad(const float *input, const float *dout, float *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ReciprocalGrad(const float *input, const float *dout, float *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void SqrtGrad(const half *input, const half *dout, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void RsqrtGrad(const half *input, const half *dout, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AsinGrad(const half *input, const half *dout, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ACosGrad(const half *input, const half *dout, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AtanGrad(const half *input, const half *dout, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AsinhGrad(const half *input, const half *dout, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void AcoshGrad(const half *input, const half *dout, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void 
ReciprocalGrad(const half *input, const half *dout, half *output, + const size_t count, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cuh new file mode 100755 index 00000000000..bac30717450 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cuh @@ -0,0 +1,38 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_GRAD_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_GRAD_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT void SqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void RsqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void AsinGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void ACosGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void AtanGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void AsinhGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void AcoshGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void ReciprocalGrad(const T *input, const T *dout, T *output, const size_t count, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cu new file mode 100755 index 00000000000..ee324347eba --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cu @@ -0,0 +1,920 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "unary_op_impl.cuh" +#include "include/cuda_fp16.h" +template +__global__ void ExponentialKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = expf(input[i]); + } + return; +} +template <> +__global__ void ExponentialKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = exp(input[i]); + } + return; +} +template <> +__global__ void ExponentialKernel(const half *input, half *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = hexp(input[i]); + } + return; +} +template +__global__ void Expm1Kernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = expm1f(input[i]); + } + return; +} +template <> +__global__ void Expm1Kernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = expm1(input[i]); + } + return; +} +template +__global__ void LogarithmKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = logf(input[i]); + } + return; +} +template <> +__global__ void 
LogarithmKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = log(input[i]); + } + return; +} +template <> +__global__ void LogarithmKernel(const half *input, half *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = hlog(input[i]); + } + return; +} +template +__global__ void Log1pKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = log1pf(input[i]); + } + return; +} +template <> +__global__ void Log1pKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = log1p(input[i]); + } + return; +} +template +__global__ void ErfKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = erff(input[i]); + } + return; +} +template <> +__global__ void ErfKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = erf(input[i]); + } + return; +} +template +__global__ void ErfcKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = erfcf(input[i]); + } + return; +} +template <> +__global__ void ErfcKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = erfc(input[i]); + } + return; +} +template +__global__ void NegativeKernel(const T *input, T *output, const 
size_t count) { + T neg_one = -1; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = neg_one * input[i]; + } + return; +} +template +__global__ void ReciprocalKernel(const T *input, T *output, const size_t count) { + T one = 1.0; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = one / input[i]; + } + return; +} +template +__global__ void SquareKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = input[i] * input[i]; + } + return; +} +template +__global__ void SqrtKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = sqrtf(input[i]); + } + return; +} +template <> +__global__ void SqrtKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = sqrt(input[i]); + } + return; +} +template <> +__global__ void SqrtKernel(const half *input, half *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = hsqrt(input[i]); + } + return; +} +template +__global__ void RsqrtKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = rsqrtf(input[i]); + } + return; +} +template <> +__global__ void RsqrtKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = rsqrt(input[i]); + } + return; +} +template <> +__global__ void RsqrtKernel(const half *input, half *output, const size_t count) { + 
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = hrsqrt(input[i]); + } + return; +} +template +__global__ void SinKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = sinf(input[i]); + } + return; +} +template <> +__global__ void SinKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = sin(input[i]); + } + return; +} +template <> +__global__ void SinKernel(const half *input, half *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = hsin(input[i]); + } + return; +} +template +__global__ void AsinKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = asinf(input[i]); + } + return; +} +template <> +__global__ void AsinKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = asin(input[i]); + } + return; +} +template +__global__ void AsinhKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = asinhf(input[i]); + } + return; +} +template <> +__global__ void AsinhKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = asinh(input[i]); + } + return; +} +template +__global__ void CosKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); 
i += blockDim.x * gridDim.x) { + output[i] = cosf(input[i]); + } + return; +} +template <> +__global__ void CosKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = cos(input[i]); + } + return; +} +template <> +__global__ void CosKernel(const half *input, half *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = hcos(input[i]); + } + return; +} +template +__global__ void ACosKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = acosf(input[i]); + } + return; +} +template <> +__global__ void ACosKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = acos(input[i]); + } + return; +} +template +__global__ void AcoshKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = acoshf(input[i]); + } + return; +} +template <> +__global__ void AcoshKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = acosh(input[i]); + } + return; +} +template +__global__ void AtanKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = atanf(input[i]); + } + return; +} +template <> +__global__ void AtanKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = 
atan(input[i]); + } + return; +} +template +__global__ void AbsKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = abs(input[i]); + } + return; +} +template <> +__global__ void AbsKernel(const half *input, half *output, const size_t count) { + half zero = 0.0; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = input[i] < zero ? -input[i] : input[i]; + } + return; +} +template +__global__ void AbsKernel(const Complex *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = abs(input[i]); + } + return; +} +template +__global__ void RealKernel(const Complex *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = input[i].real(); + } + return; +} +template +__global__ void RealKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = input[i]; + } + return; +} +template +__global__ void ImagKernel(const Complex *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = input[i].imag(); + } + return; +} +template +__global__ void ImagKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + T zero = 0; + output[i] = zero; + } + return; +} +template +__global__ void ConjKernel(const Complex *input, Complex *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = Complex(input[i].real(), 
-input[i].imag()); + } + return; +} +template +__global__ void ConjKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = input[i]; + } + return; +} +template +__global__ void FloorKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = floorf(input[i]); + } + return; +} +template <> +__global__ void FloorKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = floor(input[i]); + } + return; +} +template <> +__global__ void FloorKernel(const half *input, half *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = hfloor(input[i]); + } + return; +} +template +__global__ void RintKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = rintf(input[i]); + } + return; +} +template <> +__global__ void RintKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = rint(input[i]); + } + return; +} +template <> +__global__ void RintKernel(const half *input, half *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = hrint(input[i]); + } + return; +} +template +__global__ void RoundKernel(const T *input, T *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = nearbyintf(input[i]); + } + return; +} +template <> +__global__ 
void RoundKernel(const double *input, double *output, const size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = nearbyint(input[i]); + } + return; +} +template +__global__ void SignKernel(const T *input, T *output, const size_t count) { + T zero = 0.0; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + T res; + if (input[i] < zero) { + res = -1; + } else if (input[i] > zero) { + res = 1; + } else { + res = 0; + } + output[i] = static_cast(res); + } + return; +} +template +void Exponential(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + ExponentialKernel<<>>(input, output, count); + return; +} +template +void Expm1(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + Expm1Kernel<<>>(input, output, count); + return; +} +template +void Logarithm(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + LogarithmKernel<<>>(input, output, count); + return; +} +template +void Log1p(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + Log1pKernel<<>>(input, output, count); + return; +} +template +void Erf(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + ErfKernel<<>>(input, output, count); + return; +} +template +void Erfc(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + ErfcKernel<<>>(input, output, count); + return; +} +template +void Negative(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + NegativeKernel<<>>(input, output, count); + return; +} +template +void Reciprocal(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + ReciprocalKernel<<>>(input, output, count); + return; +} +template +void Square(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + SquareKernel<<>>(input, output, count); + return; +} +template 
+void Pow(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + PowKernel<<>>(input, output, count); + return; +} +template +void Sqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + SqrtKernel<<>>(input, output, count); + return; +} +template +void Sin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + SinKernel<<>>(input, output, count); + return; +} +template +void Cos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + CosKernel<<>>(input, output, count); + return; +} +template +void Asin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + AsinKernel<<>>(input, output, count); + return; +} +template +void ACos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + ACosKernel<<>>(input, output, count); + return; +} +template +void Atan(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + AtanKernel<<>>(input, output, count); + return; +} +template +void Asinh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + AsinhKernel<<>>(input, output, count); + return; +} +template +void Acosh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + AcoshKernel<<>>(input, output, count); + return; +} +template +void Rsqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + RsqrtKernel<<>>(input, output, count); + return; +} +template +void Abs(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + AbsKernel<<>>(input, output, count); + return; +} +template +void Abs(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream) { + AbsKernel<<>>(input, output, count); + return; +} +template +void Real(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream) { + RealKernel<<>>(input, output, count); + return; +} +template +void Real(const T *input, T *output, const size_t 
count, cudaStream_t cuda_stream) { + RealKernel<<>>(input, output, count); + return; +} +template +void Imag(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream) { + ImagKernel<<>>(input, output, count); + return; +} +template +void Imag(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + ImagKernel<<>>(input, output, count); + return; +} +template +void Conj(const Complex *input, Complex *output, const size_t count, cudaStream_t cuda_stream) { + ConjKernel<<>>(input, output, count); + return; +} +template +void Conj(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + ConjKernel<<>>(input, output, count); + return; +} +template +void Floor(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + FloorKernel<<>>(input, output, count); + return; +} +template +void Rint(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + RintKernel<<>>(input, output, count); + return; +} +template +void Round(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + RoundKernel<<>>(input, output, count); + return; +} +template +void Sign(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { + SignKernel<<>>(input, output, count); + return; +} + +// double +template CUDA_LIB_EXPORT void Exponential(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Expm1(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Logarithm(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Log1p(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erf(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erfc(const double *input, 
double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Negative(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Reciprocal(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Square(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sqrt(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sin(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Cos(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asin(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ACos(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Atan(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asinh(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Acosh(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rsqrt(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Abs(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Floor(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rint(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Round(const double *input, double *output, const size_t 
count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sign(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Real(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const double *input, double *output, const size_t count, + cudaStream_t cuda_stream); + + +// float +template CUDA_LIB_EXPORT void Exponential(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Expm1(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Logarithm(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Log1p(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erf(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erfc(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Negative(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Reciprocal(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Square(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sqrt(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sin(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Cos(const float *input, float *output, const size_t count, + cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void Asin(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ACos(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Atan(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asinh(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Acosh(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rsqrt(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Abs(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Floor(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rint(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Round(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sign(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Real(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const float *input, float *output, const size_t count, + cudaStream_t cuda_stream); + +// half +template CUDA_LIB_EXPORT void Exponential(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Expm1(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Logarithm(const 
half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Log1p(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erf(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erfc(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Negative(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Reciprocal(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Square(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sqrt(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sin(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Cos(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asin(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ACos(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Atan(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asinh(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Acosh(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rsqrt(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Abs(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void 
Floor(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rint(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Round(const half *input, half *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sign(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Real(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); + +// int8 +template CUDA_LIB_EXPORT void Exponential(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Expm1(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Logarithm(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Log1p(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erf(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erfc(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Negative(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Reciprocal(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Square(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sqrt(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); 
+template CUDA_LIB_EXPORT void Sin(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Cos(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asin(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ACos(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Atan(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asinh(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Acosh(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rsqrt(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Abs(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Floor(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rint(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Round(const char *input, char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sign(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Real(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); + +// uint8 +template CUDA_LIB_EXPORT void Exponential(const unsigned char *input, unsigned char *output, + const size_t count, 
cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Expm1(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Logarithm(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Log1p(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erf(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erfc(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Negative(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Reciprocal(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Square(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sqrt(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sin(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Cos(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asin(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ACos(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Atan(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asinh(const unsigned 
char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Acosh(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rsqrt(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Abs(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Floor(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rint(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Round(const unsigned char *input, unsigned char *output, + const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sign(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Real(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const unsigned char *input, unsigned char *output, const size_t count, + cudaStream_t cuda_stream); + +// int32 +template CUDA_LIB_EXPORT void Exponential(const int *input, int *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Expm1(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Logarithm(const int *input, int *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Log1p(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erf(const int *input, int *output, 
const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Erfc(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Negative(const int *input, int *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Reciprocal(const int *input, int *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Square(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sqrt(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Sin(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Cos(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asin(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void ACos(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Atan(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Asinh(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Acosh(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rsqrt(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Abs(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Floor(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Rint(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Round(const int *input, int *output, const size_t count, cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void Sign(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Real(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); + +// complex64 +template CUDA_LIB_EXPORT void Real(const Complex *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const Complex *input, float *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const Complex *input, Complex *output, const size_t count, + cudaStream_t cuda_stream); + +// complex128 +template CUDA_LIB_EXPORT void Real(const Complex *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const Complex *input, double *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const Complex *input, Complex *output, const size_t count, + cudaStream_t cuda_stream); + +// bool +template CUDA_LIB_EXPORT void Real(const bool *input, bool *output, const size_t count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const bool *input, bool *output, const size_t count, cudaStream_t cuda_stream); + +// int16 +template CUDA_LIB_EXPORT void Real(const int16_t *input, int16_t *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const int16_t *input, int16_t *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const int16_t *input, int16_t *output, const size_t count, + cudaStream_t cuda_stream); + +// uint16 +template CUDA_LIB_EXPORT void Real(const uint16_t *input, uint16_t *output, const size_t count, + cudaStream_t 
cuda_stream); +template CUDA_LIB_EXPORT void Imag(const uint16_t *input, uint16_t *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const uint16_t *input, uint16_t *output, const size_t count, + cudaStream_t cuda_stream); + +// uint32 +template CUDA_LIB_EXPORT void Real(const uint32_t *input, uint32_t *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const uint32_t *input, uint32_t *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const uint32_t *input, uint32_t *output, const size_t count, + cudaStream_t cuda_stream); + +// int64 +template CUDA_LIB_EXPORT void Real(const int64_t *input, int64_t *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const int64_t *input, int64_t *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const int64_t *input, int64_t *output, const size_t count, + cudaStream_t cuda_stream); + +// uint64 +template CUDA_LIB_EXPORT void Real(const uint64_t *input, uint64_t *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Imag(const uint64_t *input, uint64_t *output, const size_t count, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void Conj(const uint64_t *input, uint64_t *output, const size_t count, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh new file mode 100755 index 00000000000..1ff160f77a7 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh @@ -0,0 +1,79 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" +template +CUDA_LIB_EXPORT void Exponential(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Expm1(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Logarithm(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Log1p(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Erf(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Erfc(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Negative(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Reciprocal(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Square(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Sqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Rsqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); 
+template +CUDA_LIB_EXPORT void Sin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Cos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Asin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void ACos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Atan(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Asinh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Acosh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Abs(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Floor(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Rint(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Round(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Sign(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Real(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Real(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Imag(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Imag(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Conj(const Complex *input, Complex *output, const size_t count, cudaStream_t cuda_stream); +template +CUDA_LIB_EXPORT void Conj(const T *input, T *output, const size_t count, 
cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNARY_OP_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cu similarity index 73% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cu index d020a4fc9fd..be203aeb085 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh" template __global__ void AssignToOutput(const int64_t size, const S prob_val, S *output_array) { @@ -31,6 +31,7 @@ void CalUniformCandidateSampler(const int64_t true_size, const int64_t num_sampl sampled_expected_count); } -template void CalUniformCandidateSampler(const int64_t true_size, const int64_t num_sampled, - const float prob_val, float *true_expected_count, - float *sampled_expected_count, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void CalUniformCandidateSampler(const int64_t true_size, const int64_t num_sampled, + const float prob_val, float *true_expected_count, + float *sampled_expected_count, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh new file mode 100644 index 00000000000..575c8258833 --- /dev/null +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_ +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" + +template +CUDA_LIB_EXPORT void CalUniformCandidateSampler(const int64_t true_size, const int64_t num_sampled, const S prob_val, + S *true_expected_count, S *sampled_expected_count, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cu similarity index 73% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cu index 1ecffbc9b06..e604fc3d8ad 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cu @@ -23,7 +23,6 @@ #include #include #include "unique_impl.cuh" -#include 
"plugin/device/gpu/hal/device/cuda_common.h" #include "include/cuda_fp16.h" template @@ -66,11 +65,14 @@ int CalUnique(const T *input, int num_elements, S *input_index, S *sorted_index, return output_size; } -template int CalUnique(const float *input, int num_elements, int *input_index, int *sorted_index, - float *output, int *index, cudaStream_t cuda_stream); -template int CalUnique(const half *input, int num_elements, int *input_index, int *sorted_index, - half *output, int *index, cudaStream_t cuda_stream); -template int CalUnique(const int *input, int num_elements, int *input_index, int *sorted_index, - int *output, int *index, cudaStream_t cuda_stream); -template int CalUnique(const int64_t *input, int num_elements, int64_t *input_index, - int64_t *sorted_index, int64_t *output, int64_t *index, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT int CalUnique(const float *input, int num_elements, int *input_index, + int *sorted_index, float *output, int *index, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT int CalUnique(const half *input, int num_elements, int *input_index, + int *sorted_index, half *output, int *index, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT int CalUnique(const int *input, int num_elements, int *input_index, + int *sorted_index, int *output, int *index, cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT int CalUnique(const int64_t *input, int num_elements, int64_t *input_index, + int64_t *sorted_index, int64_t *output, int64_t *index, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh similarity index 57% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh index 2ae09fc58dd..f1cf917f079 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh @@ -14,12 +14,10 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SQUARE_SUM_ALL_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SQUARE_SUM_ALL_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void SquareSumAll(const size_t input_size_, const T* input_addr_0, const T* input_addr_1, - T* output_addr_0, T* output_addr_1, float* ws_addr_0, float* ws_addr_1, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SQUARE_SUM_ALL_IMPL_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIQUE_IMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIQUE_IMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +template +CUDA_LIB_EXPORT int CalUnique(const T *input, int num_elements, S *input_index, S *sorted_index, T *output, S *index, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNIQUE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cu new file mode 100755 index 00000000000..9cd1baa03d6 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cu @@ -0,0 +1,75 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh" +#include "include/cuda_fp16.h" +template +__global__ void Unpack(const size_t size, const size_t output_num, + const size_t dims_after_axis, T** outputs, const T* input) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { + size_t cur_input_index = pos / dims_after_axis % output_num; + size_t cycle_len = output_num * dims_after_axis; + size_t local_index = pos / cycle_len * dims_after_axis + pos % cycle_len % dims_after_axis; + outputs[cur_input_index][local_index] = input[pos]; + } + return; +} + +template +void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, T** outputs, const T* input, + cudaStream_t cuda_stream) { + Unpack<<>>(size, output_num, + dims_after_axis, outputs, input); + return; +} + +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, int8_t** outputs, const int8_t* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, int16_t** outputs, const int16_t* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, int** outputs, const int* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, int64_t** outputs, const int64_t* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, uint8_t** outputs, const uint8_t* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void 
UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, uint16_t** outputs, const uint16_t* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, uint32_t** outputs, const uint32_t* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, uint64_t** outputs, const uint64_t* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, half** outputs, const half* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, float** outputs, const float* input, + cudaStream_t cuda_stream); +template CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, bool** outputs, const bool* input, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh similarity index 53% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh index c4ea6e21026..420f12ff562 100755 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh @@ -14,12 +14,11 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNPACKIMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNPACKIMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNPACKIMPL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNPACKIMPL_CUH_ +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, T** outputs, const T* input, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNPACKIMPL_H_ +CUDA_LIB_EXPORT void UnpackKernel(const size_t size, const size_t output_num, + const size_t dims_after_axis, T** outputs, const T* input, + cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNPACKIMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cu similarity index 52% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cu index e3af209b663..850563e1c2e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cu @@ -14,8 +14,9 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh" #include +#include "include/cuda_fp16.h" template __global__ void UnsortedSegmentMax(const T *input, const S *segment_ids, const int64_t num_segments, size_t outer_size, @@ -71,19 +72,23 @@ void CalUnsortedSegmentMax(const T *input, const S *segment_ids, const int64_t n return; } -template void CalUnsortedSegmentMax(const float *input, const int *segment_ids, const int64_t num_segments, - size_t outer_size, size_t inner_size, float *output, - cudaStream_t stream); -template void CalUnsortedSegmentMax(const float *input, const int64_t *segment_ids, - const int64_t num_segments, size_t outer_size, size_t inner_size, - float *output, cudaStream_t stream); -template void CalUnsortedSegmentMax(const half *input, const int *segment_ids, const int64_t num_segments, - size_t outer_size, size_t inner_size, half *output, cudaStream_t stream); -template void CalUnsortedSegmentMax(const half *input, const int64_t *segment_ids, - const int64_t num_segments, size_t outer_size, size_t inner_size, - half *output, cudaStream_t stream); -template void CalUnsortedSegmentMax(const int *input, const int *segment_ids, const int64_t num_segments, - size_t outer_size, size_t inner_size, int *output, cudaStream_t stream); -template void CalUnsortedSegmentMax(const int *input, const int64_t *segment_ids, - const int64_t num_segments, size_t outer_size, size_t inner_size, - int *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalUnsortedSegmentMax(const float *input, const int *segment_ids, + const int64_t num_segments, size_t outer_size, + size_t inner_size, float *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalUnsortedSegmentMax(const float *input, const int64_t *segment_ids, + const int64_t num_segments, size_t outer_size, + size_t inner_size, float *output, + cudaStream_t stream); +template 
CUDA_LIB_EXPORT void CalUnsortedSegmentMax(const half *input, const int *segment_ids, + const int64_t num_segments, size_t outer_size, + size_t inner_size, half *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalUnsortedSegmentMax(const half *input, const int64_t *segment_ids, + const int64_t num_segments, size_t outer_size, + size_t inner_size, half *output, + cudaStream_t stream); +template CUDA_LIB_EXPORT void CalUnsortedSegmentMax(const int *input, const int *segment_ids, + const int64_t num_segments, size_t outer_size, + size_t inner_size, int *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalUnsortedSegmentMax(const int *input, const int64_t *segment_ids, + const int64_t num_segments, size_t outer_size, + size_t inner_size, int *output, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh similarity index 56% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh index 4f31f2e3826..86720c235ce 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh @@ -14,16 +14,15 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MAX_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MAX_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MAX_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MAX_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" // Setting warp size to sync data across threads #define KWARPSIZE 32 template -void CalUnsortedSegmentMax(const T *input, const S *segment_ids, const int64_t num_segments, size_t outer_size, - size_t inner_size, T *output, cudaStream_t stream); +CUDA_LIB_EXPORT void CalUnsortedSegmentMax(const T *input, const S *segment_ids, const int64_t num_segments, + size_t outer_size, size_t inner_size, T *output, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_MAX_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MAX_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cu similarity index 71% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cu rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cu index ec3141e0397..4e8271fe998 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cu @@ -14,8 +14,9 @@ * limitations under the License. 
*/ -#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh" #include +#include "include/cuda_fp16.h" template __device__ __forceinline__ void max_val_init(T *init_val) { @@ -71,9 +72,12 @@ void CalUnsortedSegmentMin(const T *input, const int *segment_ids, const int64_t return; } -template void CalUnsortedSegmentMin(const float *input, const int *segment_ids, const int64_t num_segments, - size_t outer_size, size_t inner_size, float *output, cudaStream_t stream); -template void CalUnsortedSegmentMin(const half *input, const int *segment_ids, const int64_t num_segments, - size_t outer_size, size_t inner_size, half *output, cudaStream_t stream); -template void CalUnsortedSegmentMin(const int *input, const int *segment_ids, const int64_t num_segments, - size_t outer_size, size_t inner_size, int *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalUnsortedSegmentMin(const float *input, const int *segment_ids, + const int64_t num_segments, size_t outer_size, + size_t inner_size, float *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalUnsortedSegmentMin(const half *input, const int *segment_ids, + const int64_t num_segments, size_t outer_size, + size_t inner_size, half *output, cudaStream_t stream); +template CUDA_LIB_EXPORT void CalUnsortedSegmentMin(const int *input, const int *segment_ids, + const int64_t num_segments, size_t outer_size, + size_t inner_size, int *output, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh similarity index 55% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh index 4d8603a6f8c..335147731a4 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh @@ -14,15 +14,14 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MIN_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MIN_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MIN_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MIN_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" // Setting warp size to sync data across threads #define KWARPSIZE 32 template -void CalUnsortedSegmentMin(const T *input, const int *segment_ids, const int64_t num_segments, size_t outer_size, - size_t inner_size, T *output, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_MIN_H_ +CUDA_LIB_EXPORT void CalUnsortedSegmentMin(const T *input, const int *segment_ids, const int64_t num_segments, + size_t outer_size, size_t inner_size, T *output, cudaStream_t stream); +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORTED_SEGMENT_MIN_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cu new file mode 100644 index 00000000000..c0ab224eab0 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cu @@ -0,0 +1,75 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" + +template +__global__ void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, + T* input_addr, S* ids_addr, T* output_addr) { + for (int input_index = blockIdx.x * blockDim.x + threadIdx.x; input_index < input_dim0 * input_dim1; + input_index += blockDim.x * gridDim.x) { + size_t j = input_index / input_dim1; + size_t k = input_index % input_dim1; + + S i = ids_addr[j]; + if (i < 0 || i >= output_dim0) { + continue; + } + size_t output_index = i * output_dim1 + k; + MsAtomicAdd(output_addr + output_index, input_addr[input_index]); + } +} + +template +void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, + T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream) { + int size = input_dim0 * input_dim1; + UnsortedSegmentSum<<>>(input_dim0, input_dim1, + output_dim0, output_dim1, input_addr, ids_addr, output_addr); + return; +} + +template CUDA_LIB_EXPORT void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, + size_t output_dim1, double* input_addr, int* ids_addr, + double* output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, + size_t output_dim1, double* input_addr, int64_t* ids_addr, + double* output_addr, cudaStream_t stream); + +template CUDA_LIB_EXPORT void 
UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, + size_t output_dim1, float* input_addr, int* ids_addr, + float* output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, + size_t output_dim1, float* input_addr, int64_t* ids_addr, + float* output_addr, cudaStream_t stream); + +template CUDA_LIB_EXPORT void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, + size_t output_dim1, half* input_addr, int* ids_addr, half* output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, + size_t output_dim1, half* input_addr, int64_t* ids_addr, + half* output_addr, cudaStream_t stream); + +template CUDA_LIB_EXPORT void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, + size_t output_dim1, int* input_addr, int* ids_addr, int* output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, + size_t output_dim1, int* input_addr, int64_t* ids_addr, + int* output_addr, cudaStream_t stream); + + + diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh similarity index 54% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh index 65a9267ea7e..43a2fe9ba5f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh @@ -14,14 +14,13 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_SUM_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_SUM_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORT_SEGMENT_SUM_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORT_SEGMENT_SUM_CUH_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template -void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - T* input_addr, S* ids, T* output_addr, cudaStream_t stream); +CUDA_LIB_EXPORT void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, + T* input_addr, S* ids, T* output_addr, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORT_SEGMENT_SUM_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UNSORT_SEGMENT_SUM_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/util.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh similarity index 98% rename from mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/util.cuh rename to mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh index b5aec5a361a..3dbc20374a8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/util.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh @@ -14,14 +14,11 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UTIL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UTIL_H_ - +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UTIL_CUH_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UTIL_CUH_ #include - #include - -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #define kThreadsPerBlock (256) #define kBlocksPerGrid(n) ((n + kThreadsPerBlock - 1) / kThreadsPerBlock) @@ -553,4 +550,4 @@ enum : unsigned { warp_size = 32, log_wap_size = 5 }; __device__ __forceinline__ unsigned LaneId() { return threadIdx.x & (warp_size - 1); } __device__ __forceinline__ unsigned WarpId(const unsigned &tid) { return tid >> log_wap_size; } -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UTIL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_UTIL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cu deleted file mode 100644 index 47f2fe73d85..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cu +++ /dev/null @@ -1,138 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include "depthtospace_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void DepthToSpace(const size_t size, const T *input, const size_t in, - const size_t ic, const size_t ih, const size_t iw, - const size_t on, const size_t oc, const size_t oh, - const size_t ow, const size_t r, T *output) { - size_t temp_stride = 0; - size_t temp_pos = 0; - size_t input_pos = 0; - size_t output_pos_array[DEPTHTOSPACE_BUFFER_DIMENSION]; - - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; - pos += blockDim.x * gridDim.x) { - temp_stride = oc * oh * ow; - output_pos_array[0] = pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= oc; - output_pos_array[1] = temp_pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= oh; - output_pos_array[2] = temp_pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= ow; - output_pos_array[3] = temp_pos / temp_stride; - - input_pos += output_pos_array[0]; - input_pos = - (input_pos * ic) + - (output_pos_array[1] + - (r * (output_pos_array[2] % r) + output_pos_array[3] % r) * oc); - input_pos = (input_pos * ih) + (output_pos_array[2] / r); - input_pos = (input_pos * iw) + (output_pos_array[3] / r); - - output[pos] = input[input_pos]; - input_pos = 0; - } - return; -} - -template -void CalDepthToSpace(const size_t size, const T *input, const size_t in, - const size_t ic, const size_t ih, const size_t iw, - const size_t on, const size_t oc, const size_t oh, - const size_t ow, const size_t r, T *output, - cudaStream_t cuda_stream) { - DepthToSpace<<>>( - size, input, in, ic, ih, iw, on, oc, oh, ow, r, output); - return; -} - -template void CalDepthToSpace(const size_t size, const float *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, float *output, - cudaStream_t cuda_stream); -template void 
CalDepthToSpace(const size_t size, const half *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, half *output, - cudaStream_t cuda_stream); -template void CalDepthToSpace(const size_t size, const int *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, int *output, - cudaStream_t cuda_stream); -template void CalDepthToSpace(const size_t size, const int64_t *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, int64_t *output, - cudaStream_t cuda_stream); -template void CalDepthToSpace(const size_t size, const int16_t *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, int16_t *output, - cudaStream_t cuda_stream); -template void CalDepthToSpace(const size_t size, const int8_t *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, int8_t *output, - cudaStream_t cuda_stream); -template void CalDepthToSpace(const size_t size, const uint8_t *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, uint8_t *output, - cudaStream_t cuda_stream); -template void -CalDepthToSpace(const size_t size, const uint16_t *input, - const size_t in, const size_t ic, const size_t ih, - const size_t iw, const size_t on, const size_t oc, - const size_t oh, const size_t ow, const size_t r, - uint16_t *output, cudaStream_t cuda_stream); -template void -CalDepthToSpace(const size_t size, const uint32_t 
*input, - const size_t in, const size_t ic, const size_t ih, - const size_t iw, const size_t on, const size_t oc, - const size_t oh, const size_t ow, const size_t r, - uint32_t *output, cudaStream_t cuda_stream); -template void -CalDepthToSpace(const size_t size, const uint64_t *input, - const size_t in, const size_t ic, const size_t ih, - const size_t iw, const size_t on, const size_t oc, - const size_t oh, const size_t ow, const size_t r, - uint64_t *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh deleted file mode 100644 index 289ecc3673a..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DEPTHTOSPACE_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DEPTHTOSPACE_H_ - -#define DEPTHTOSPACE_BUFFER_DIMENSION 4 -template -void CalDepthToSpace(const size_t size, const T *input, const size_t in, - const size_t ic, const size_t ih, const size_t iw, - const size_t on, const size_t oc, const size_t oh, - const size_t ow, const size_t r, T *output, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DEPTHTOSPACE_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh deleted file mode 100644 index 57d5ce552f7..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void DropoutForward(const T *input, T *mask, T *output, float *mask_f, size_t num_count, float keep_prob, - cudaStream_t cuda_stream); -template -void DropoutBackward(const T *dy, const T *mask, T *dx, size_t num_count, float keep_prob, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cu deleted file mode 100644 index b292abe035a..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cu +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "dynamic_range_impl.cuh" - -#include - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void ValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta, - int64_t *output_shape, DynamicRangeErrorCode *error_code, - const int64_t max_output_size) { - T start = range_start[0]; - T end = range_end[0]; - T delta = range_delta[0]; - *error_code = DynamicRangeErrorCode::kOk; - - if (delta == 0) { - *error_code = DynamicRangeErrorCode::kDeltaIsZero; - return; - } - - if (start < end && delta < 0) { - *error_code = DynamicRangeErrorCode::kInvalidNegativeDelta; - return; - } - - if (start > end && delta > 0) { - *error_code = DynamicRangeErrorCode::kInvalidPositiveDelta; - return; - } - - if (*error_code == DynamicRangeErrorCode::kOk) { - int64_t real_output_shape = static_cast(ceil(static_cast(end - start) / delta)); - - // verification in case of precision error during calculation of real_output_shape. one multiplication followed by - // one addition is much more precise than the division that occurs when calculating real_output_shape. 
- double last_value = start + (delta * (real_output_shape - 1)); - double epsilon = 1e-6; - if ((end > start && last_value > end) || (start > end && last_value < end) || fabsf(last_value - end) < epsilon) { - real_output_shape--; - } - - if (real_output_shape > max_output_size) { - *error_code = DynamicRangeErrorCode::kMaxSizeExceeded; - } - *output_shape = real_output_shape; - } -} - -template -__global__ void Range(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape, - const int64_t max_output_size) { - T start = range_start[0]; - T delta = range_delta[0]; - - size_t gt_id = blockIdx.x * blockDim.x + threadIdx.x; - for (; gt_id < *output_shape; gt_id += blockDim.x * gridDim.x) { - output[gt_id] = gt_id * delta + start; - } -} - -template -void CudaValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta, - int64_t *output_shape, DynamicRangeErrorCode *error_code, - const int64_t max_output_size, cudaStream_t cuda_stream) { - ValidateInputAndInferShape<<<1, 1, 0, cuda_stream>>>(range_start, range_end, range_delta, output_shape, error_code, - max_output_size); -} - -template -void CalRange(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape, - DynamicRangeErrorCode *error_code, const int64_t max_output_size, cudaStream_t cuda_stream) { - Range<<>>(range_start, range_end, range_delta, - output, output_shape, max_output_size); -} - -template void CudaValidateInputAndInferShape(const int *range_start, const int *range_end, const int *range_delta, - int64_t *output_shape, DynamicRangeErrorCode *error_code, - const int64_t max_output_size, cudaStream_t cuda_stream); -template void CudaValidateInputAndInferShape(const int64_t *range_start, const int64_t *range_end, - const int64_t *range_delta, int64_t *output_shape, - DynamicRangeErrorCode *error_code, const int64_t max_output_size, - cudaStream_t cuda_stream); -template void 
CudaValidateInputAndInferShape(const float *range_start, const float *range_end, - const float *range_delta, int64_t *output_shape, - DynamicRangeErrorCode *error_code, const int64_t max_output_size, - cudaStream_t cuda_stream); -template void CudaValidateInputAndInferShape(const double *range_start, const double *range_end, - const double *range_delta, int64_t *output_shape, - DynamicRangeErrorCode *error_code, const int64_t max_output_size, - cudaStream_t cuda_stream); - -template void CalRange(const int *range_start, const int *range_end, const int *range_delta, int *output, - int64_t *output_shape, DynamicRangeErrorCode *error_code, const int64_t max_output_size, - cudaStream_t cuda_stream); -template void CalRange(const int64_t *range_start, const int64_t *range_end, const int64_t *range_delta, - int64_t *output, int64_t *output_shape, DynamicRangeErrorCode *error_code, - const int64_t max_output_size, cudaStream_t cuda_stream); -template void CalRange(const float *range_start, const float *range_end, const float *range_delta, float *output, - int64_t *output_shape, DynamicRangeErrorCode *error_code, const int64_t max_output_size, - cudaStream_t cuda_stream); -template void CalRange(const double *range_start, const double *range_end, const double *range_delta, - double *output, int64_t *output_shape, DynamicRangeErrorCode *error_code, - const int64_t max_output_size, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh deleted file mode 100644 index 535e3443031..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_RANGE_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_RANGE_CUH_ - -#include - -enum class DynamicRangeErrorCode { - kOk = 0, - kDeltaIsZero, - kInvalidPositiveDelta, - kInvalidNegativeDelta, - kMaxSizeExceeded -}; - -template -void CudaValidateInputAndInferShape(const T *range_start, const T *range_end, const T *range_delta, - int64_t *output_shape, DynamicRangeErrorCode *error_code, - const int64_t max_output_size, cudaStream_t cuda_stream); - -template -void CalRange(const T *range_start, const T *range_end, const T *range_delta, T *output, int64_t *output_shape, - DynamicRangeErrorCode *error_code, const int64_t max_output_size, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_DYNAMIC_RANGE_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cuh deleted file mode 100644 index f0876f283eb..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/einsum_impl.cuh +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2022 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_EINSUM_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_EINSUM_H_ -#include "plugin/device/gpu/hal/device/cuda_common.h" -#define EINSUM_MAX_DIMENSION 20 -template -struct DynamicSharedMem; -template <> -struct DynamicSharedMem { - __device__ double *addr() { - extern __shared__ double addr_double[]; - return addr_double; - } -}; -template <> -struct DynamicSharedMem { - __device__ float *addr() { - extern __shared__ float addr_float[]; - return addr_float; - } -}; -template <> -struct DynamicSharedMem { - __device__ half *addr() { - extern __shared__ half addr_half[]; - return addr_half; - } -}; -template -void CalDiagonal(const size_t size, const T *input, const size_t *input_shape, const size_t shape_size, - const size_t left_dim, const size_t right_dim, T *output, cudaStream_t cuda_stream); -template -void CalDiagonalGrad(const size_t d_size, const T *dout, const size_t *input_shape, const size_t shape_size, - const size_t left_dim, const size_t right_dim, T *d_inp, cudaStream_t cuda_stream); -template -void CalDot(const size_t size, T *input_a, const T *input_b, T *output, cudaStream_t cuda_stream); -template -void CalDotGrad(const size_t size, const T dout, T *mid_res, T *input_b, T *input_a, cudaStream_t cuda_stream); -template -void CalMul(const bool broadcast_flag, const size_t shape_len, const size_t *lft_shape, const size_t lft_num, - const size_t *rht_shape, const size_t rht_num, const size_t *out_shape, const size_t out_num, const T *x0, - const T *x1, T *y, cudaStream_t stream); 
-#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_EINSUM_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cu deleted file mode 100644 index 740716adcba..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cu +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void SubOffset(T *indices, size_t size, int64_t offset) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { - indices[pos] -= static_cast(offset); - } - return; -} - -template -void CalEmbeddingLookup(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2, - size_t input_dim1, int64_t offset, cudaStream_t stream) { - size_t size = output_dim0 * output_dim1 * output_dim2; - SubOffset<<>>(indices, output_dim1, offset); - GatherV2Kernel<<>>(input, indices, output, output_dim0, output_dim1, - output_dim2, input_dim1); - // restore indices - SubOffset<<>>(indices, output_dim1, -offset); - return; -} - -template void CalEmbeddingLookup(float *input, int *indices, float *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset, - cudaStream_t stream); -template void CalEmbeddingLookup(float *input, int64_t *indices, float *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - int64_t offset, cudaStream_t stream); -template void CalEmbeddingLookup(half *input, int *indices, half *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset, - cudaStream_t stream); -template void CalEmbeddingLookup(half *input, int64_t *indices, half *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - int64_t offset, cudaStream_t stream); -template void CalEmbeddingLookup(double *input, int *indices, double *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset, - cudaStream_t stream); -template void CalEmbeddingLookup(double *input, int64_t *indices, double *output, size_t 
output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - int64_t offset, cudaStream_t stream); -template void CalEmbeddingLookup(int *input, int *indices, int *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset, - cudaStream_t stream); -template void CalEmbeddingLookup(int *input, int64_t *indices, int *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - int64_t offset, cudaStream_t stream); -template void CalEmbeddingLookup(int16_t *input, int *indices, int16_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - int64_t offset, cudaStream_t stream); -template void CalEmbeddingLookup(int16_t *input, int64_t *indices, int16_t *output, - size_t output_dim0, size_t output_dim1, size_t output_dim2, - size_t input_dim1, int64_t offset, cudaStream_t stream); -template void CalEmbeddingLookup(int8_t *input, int *indices, int8_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset, - cudaStream_t stream); -template void CalEmbeddingLookup(int8_t *input, int64_t *indices, int8_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - int64_t offset, cudaStream_t stream); -template void CalEmbeddingLookup(uint8_t *input, int *indices, uint8_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - int64_t offset, cudaStream_t stream); -template void CalEmbeddingLookup(uint8_t *input, int64_t *indices, uint8_t *output, - size_t output_dim0, size_t output_dim1, size_t output_dim2, - size_t input_dim1, int64_t offset, cudaStream_t stream); -template void CalEmbeddingLookup(bool *input, int *indices, bool *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, int64_t offset, - cudaStream_t stream); -template void CalEmbeddingLookup(bool *input, int64_t *indices, bool 
*output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - int64_t offset, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cu deleted file mode 100644 index ae843420306..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cu +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh" - -template -__global__ void ExtractImagePatches(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row, - int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride, - int64_t patch_stride, int64_t other_stride, int64_t input_row_size, - int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left, - int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride, - int64_t output_depth, const T *input, T *output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_size; pos += blockDim.x * gridDim.x) { - const int64_t batch_index = need_batch ? (static_cast(pos) / other_stride) : 0; - const int64_t inner_index = - need_batch ? 
(static_cast(pos) - batch_index * other_stride) : static_cast(pos); - // inner index - const int64_t patch_index = inner_index / patch_stride; - const int64_t patch_offset = (inner_index - patch_index * patch_stride) / output_depth; - // row - const int64_t row_index = patch_index / output_cols; - const int64_t row_offset = patch_offset / row_stride; - const int64_t input_row = row_index * stride_row + row_offset * rate_row - row_padding_top; - if (input_row < 0 || input_row >= input_row_size) { - output[pos] = static_cast(0); - continue; - } - // col - const int64_t col_index = patch_index - row_index * output_cols; - const int64_t col_offset = patch_offset - row_offset * row_stride; - const int64_t input_col = col_index * stride_col + col_offset * rate_col - col_padding_left; - if (input_col < 0 || input_col >= input_col_size) { - output[pos] = static_cast(0); - continue; - } - // depth - const int64_t depth = inner_index - (inner_index / output_depth) * output_depth; - // input index - const int64_t input_index = - depth + input_col * col_input_stride + input_row * row_input_stride + batch_index * patch_input_stride; - output[pos] = input[static_cast(input_index)]; - } - return; -} - -template -void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row, - int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride, - int64_t patch_stride, int64_t other_stride, int64_t input_row_size, - int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left, - int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride, - int64_t output_depth, const T *input, T *output, cudaStream_t stream) { - ExtractImagePatches<<>>( - output_size, stride_row, stride_col, rate_row, rate_col, output_cols, need_batch, row_stride, patch_stride, - other_stride, input_row_size, input_col_size, row_padding_top, col_padding_left, col_input_stride, row_input_stride, - patch_input_stride, output_depth, 
input, output); -} - -template void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, - int64_t rate_row, int64_t rate_col, int64_t output_cols, bool need_batch, - int64_t row_stride, int64_t patch_stride, int64_t other_stride, - int64_t input_row_size, int64_t input_col_size, int64_t row_padding_top, - int64_t col_padding_left, int64_t col_input_stride, - int64_t row_input_stride, int64_t patch_input_stride, - int64_t output_depth, const int *input, int *output, cudaStream_t stream); -template void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, - int64_t rate_row, int64_t rate_col, int64_t output_cols, - bool need_batch, int64_t row_stride, int64_t patch_stride, - int64_t other_stride, int64_t input_row_size, int64_t input_col_size, - int64_t row_padding_top, int64_t col_padding_left, - int64_t col_input_stride, int64_t row_input_stride, - int64_t patch_input_stride, int64_t output_depth, const float *input, - float *output, cudaStream_t stream); -template void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, - int64_t rate_row, int64_t rate_col, int64_t output_cols, bool need_batch, - int64_t row_stride, int64_t patch_stride, int64_t other_stride, - int64_t input_row_size, int64_t input_col_size, int64_t row_padding_top, - int64_t col_padding_left, int64_t col_input_stride, - int64_t row_input_stride, int64_t patch_input_stride, - int64_t output_depth, const half *input, half *output, - cudaStream_t stream); -template void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, - int64_t rate_row, int64_t rate_col, int64_t output_cols, - bool need_batch, int64_t row_stride, int64_t patch_stride, - int64_t other_stride, int64_t input_row_size, int64_t input_col_size, - int64_t row_padding_top, int64_t col_padding_left, - int64_t col_input_stride, int64_t row_input_stride, - int64_t patch_input_stride, int64_t 
output_depth, const double *input, - double *output, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh deleted file mode 100644 index baaf80b611a..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EXTRACT_IMAGE_PATCHES_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EXTRACT_IMAGE_PATCHES_IMPL_CUH_ - -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void CalExtractImagePatchesNHWC(size_t output_size, int64_t stride_row, int64_t stride_col, int64_t rate_row, - int64_t rate_col, int64_t output_cols, bool need_batch, int64_t row_stride, - int64_t patch_stride, int64_t other_stride, int64_t input_row_size, - int64_t input_col_size, int64_t row_padding_top, int64_t col_padding_left, - int64_t col_input_stride, int64_t row_input_stride, int64_t patch_input_stride, - int64_t output_depth, const T *input, T *output, cudaStream_t stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_EXTRACT_IMAGE_PATCHES_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh deleted file mode 100644 index 432456116a3..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh +++ /dev/null @@ -1,33 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -void CalLSQNudgePerChannel(const float *input, const int size, float *input_alpha, float *input_quant_max, - float *input_div_alpha, float *input_quant, const bool neg_trunc, const int channel_num, - cudaStream_t cuda_stream); - -void CalFakeLearnedScaleQuantPerChannel(float *output, const int size, float *input_alpha, float *input_quant, - const int channel_num, cudaStream_t cuda_stream); - -void CalFakeLearnedScaleQuantPerChannelGrad(float *grad_input, float *grad_alpha, const float *gradient, const int size, - const float *input_div_alpha, const float *input_quant, - const bool neg_trunc, const int channel_num, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh deleted file mode 100644 index 26ca59bddee..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERLAYER_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERLAYER_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -void CalLSQNudgePerLayer(const float *input, const int size, float *input_alpha, float *input_quant_max, - float *input_div_alpha, float *input_quant, const bool neg_trunc, cudaStream_t cuda_stream); - -void CalFakeLearnedScaleQuantPerLayer(float *output, const int size, float *input_alpha, float *input_quant, - cudaStream_t cuda_stream); - -void CalFakeLearnedScaleQuantPerLayerGrad(float *grad_input, float *grad_alpha, const float *gradient, const int size, - const float *input_div_alpha, const float *input_quant, const bool neg_trunc, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERLAYER_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh deleted file mode 100644 index 36ca41adc91..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERCHANNEL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERCHANNEL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -void CalNudgePerChannel(float *input_min, float *input_max, const float quant_min, const float quant_max, - float *nudge_min, float *nudge_max, float *scale, const int channel_num, const bool symmetric, - cudaStream_t cuda_stream); - -void CalFakeQuantPerChannel(const float *input, float *output, const int total_num, const int channel_num, - const float *nudge_min, const float *nudge_max, const float *scale, - cudaStream_t cuda_stream); - -void CalFakeQuantPerChannelGrad(const float *input, const float *gradient, float *output, const int total_num, - const int channel_num, const float *nudge_min, const float *nudge_max, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERCHANNEL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh deleted file mode 100644 index 7884c3130b6..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERLAYER_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERLAYER_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -void CalNudgePerLayer(float *input_min, float *input_max, const float quant_min, const float quant_max, - float *nudge_min, float *nudge_max, float *scale, const bool symmetric, cudaStream_t cuda_stream); - -void CalFakeQuantPerLayer(const float *input, float *output, const int size, const float *nudge_min, - const float *nudge_max, const float *scale, cudaStream_t cuda_stream); - -void CalFakeQuantPerLayerGrad(const float *input, const float *gradient, float *output, const int size, - const float *nudge_min, const float *nudge_max, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERLAYER_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh deleted file mode 100644 index bc078cbc681..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FLOATSTATUS_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FLOATSTATUS_H_ -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void CalFloatStatus(const size_t size, const T *input, float *output, cudaStream_t stream); -template -void CalIsNan(const size_t size, const T *input, bool *output, cudaStream_t stream); -template -void CalIsInf(const size_t size, const T *input, bool *output, cudaStream_t stream); -template -void CalIsFinite(const size_t size, const T *input, bool *output, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_FLOATSTATUS_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cu deleted file mode 100755 index 3b6e5dd163f..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather.cu +++ /dev/null @@ -1,148 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include "plugin/device/gpu/kernel/cuda_impl/gather.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -__global__ void GatherKernel(const T *input, const S *index, T *output, const size_t dim_before_axis, - const size_t dim_at_axis_input, const size_t dim_at_axis_output, - const size_t dim_after_axis) { - size_t num = dim_before_axis * dim_at_axis_output * dim_after_axis; - size_t i, k; - for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < num; - id += blockDim.x * gridDim.x) { - i = id / (dim_at_axis_output * dim_after_axis); - k = id % dim_after_axis; - - S j = index[id]; - if (j < 0) { - j += static_cast(dim_at_axis_input); - } - CUDA_KERNEL_ASSERT(j >= 0); - size_t j_read = static_cast(j); - CUDA_KERNEL_ASSERT(j_read < dim_at_axis_input); - size_t read_id = i * dim_at_axis_input * dim_after_axis + j_read * dim_after_axis + k; - output[id] = input[read_id]; - } - return; -} -template -void Gather(const T *input, const S *index, T *output, const size_t dim_before_axis, - const size_t dim_at_axis_input, const size_t dim_at_axis_output, - const size_t dim_after_axis, cudaStream_t stream) { - size_t size = dim_before_axis * dim_at_axis_output * dim_after_axis; - GatherKernel<<>>(input, index, output, dim_before_axis, dim_at_axis_input, - dim_at_axis_output, dim_after_axis); - return; -} - -template void Gather(const double *input, const int *index, double *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const double *input, const int64_t *index, double *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const float *input, const int *index, float *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const 
size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const float *input, const int64_t *index, float *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const half *input, const int *index, half *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const half *input, const int64_t *index, half *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const int64_t *input, const int *index, int64_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const int64_t *input, const int64_t *index, int64_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const int *input, const int *index, int *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const int *input, const int64_t *index, int *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const int16_t *input, const int *index, int16_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const int16_t *input, const int64_t *index, int16_t *output, - const size_t dim_before_axis, const 
size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const int8_t *input, const int *index, int8_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const int8_t *input, const int64_t *index, int8_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const unsigned char *input, const int *index, unsigned char *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const unsigned char *input, const int64_t *index, unsigned char *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const bool *input, const int *index, bool *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const bool *input, const int64_t *index, bool *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const uint16_t *input, const int *index, uint16_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const uint16_t *input, const int64_t *index, uint16_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template 
void Gather(const uint32_t *input, const int *index, uint32_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const uint32_t *input, const int64_t *index, uint32_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const uint64_t *input, const int *index, uint64_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void Gather(const uint64_t *input, const int64_t *index, uint64_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_input, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cu deleted file mode 100755 index 7e0136caa49..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gather_grad.cu +++ /dev/null @@ -1,151 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include "plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void GatherGradKernel(const size_t num, const T *index, const S *grad, S *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis) { - size_t i, k; - - for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < num; - id += blockDim.x * gridDim.x) { - i = id / (dim_at_axis_index * dim_after_axis); - k = id % dim_after_axis; - - T j = index[id]; - if (j < 0) { - j += static_cast(dim_at_axis_output); - } - CUDA_KERNEL_ASSERT(j >= 0); - size_t j_read = static_cast(j); - CUDA_KERNEL_ASSERT(j_read < dim_at_axis_output); - size_t read_id = i * dim_at_axis_output * dim_after_axis + j_read * dim_after_axis + k; - MsAtomicAdd(output + read_id, grad[id]); - } - return; -} - -template -__global__ void InitOutput(const size_t size, S *output) { - S zero = 0; - for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < size; id += blockDim.x * gridDim.x) { - output[id] = zero; - } - return; -} - -template -void GatherGrad(const T *index, const S *grad, S *output, const size_t dim_before_axis, - const size_t dim_at_axis_index, const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream) { - size_t size = dim_before_axis * dim_at_axis_output * dim_after_axis; - InitOutput<<>>(size, output); - - size = dim_before_axis * dim_at_axis_index * dim_after_axis; - GatherGradKernel<<>>(size, index, grad, output, - dim_before_axis, dim_at_axis_index, - dim_at_axis_output, dim_after_axis); - return; -} - -template void GatherGrad(const int *index, const double *grad, double *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t 
*index, const double *grad, double *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int *index, const float *grad, float *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t *index, const float *grad, float *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int *index, const half *grad, half *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t *index, const half *grad, half *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int *index, const int *grad, int *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t *index, const int *grad, int *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int *index, const int8_t *grad, int8_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t *index, const int8_t *grad, int8_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const 
size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int *index, const int16_t *grad, int16_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t *index, const int16_t *grad, int16_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int *index, const int64_t *grad, int64_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t *index, const int64_t *grad, int64_t *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int *index, const unsigned char *grad, unsigned char *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t *index, const unsigned char *grad, unsigned char *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int *index, const unsigned int *grad, unsigned int *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t *index, const unsigned int *grad, unsigned int *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void 
GatherGrad(const int *index, const bool *grad, bool *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); -template void GatherGrad(const int64_t *index, const bool *grad, bool *output, - const size_t dim_before_axis, const size_t dim_at_axis_index, - const size_t dim_at_axis_output, const size_t dim_after_axis, - cudaStream_t stream); - - - diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cu deleted file mode 100644 index ff9f7033567..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gathernd.cu +++ /dev/null @@ -1,114 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/gathernd.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -__global__ void GatherNdKernel(T *input, S *indices, T *output, const size_t output_dim0, const size_t output_dim1, - const size_t indices_dim1, S *batch_indices, S *batch_strides) { - int num = output_dim0 * output_dim1; - int i, j; - for (int write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; - write_index += blockDim.x * gridDim.x) { - i = write_index / output_dim1 % output_dim0; - j = write_index % output_dim1; - - bool out_of_bound = false; - int read_index = 0; - int indices_i = 0; - for (size_t k = 0; k < indices_dim1; k++) { - size_t ind = indices_dim1 * i + k; - indices_i = indices[ind]; - out_of_bound |= !(indices_i < batch_indices[k]); - read_index += indices_i * batch_strides[k]; - } - read_index += j; - - if (!out_of_bound) { - output[write_index] = input[read_index]; - } else { - output[write_index] = 0; - } - } - return; -} -template -void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1, - const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream) { - int size = output_dim0 * output_dim1; - GatherNdKernel<<>>(input, indices, output, output_dim0, output_dim1, - indices_dim1, batch_indices, batch_strides); - return; -} - -template void GatherNd(double *input, int *indices, double *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, - int *batch_strides, cudaStream_t stream); -template void GatherNd(float *input, int *indices, float *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, - int *batch_strides, cudaStream_t stream); -template void GatherNd(half *input, int *indices, half *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, - int 
*batch_strides, cudaStream_t stream); -template void GatherNd(int *input, int *indices, int *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, - int *batch_strides, cudaStream_t stream); -template void GatherNd(short *input, int *indices, short *output, const size_t &output_dim0, // NOLINT - const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, - int *batch_strides, cudaStream_t stream); -template void GatherNd(unsigned int *input, int *indices, unsigned int *output, - const size_t &output_dim0, const size_t &output_dim1, - const size_t &indices_dim1, int *batch_indices, int *batch_strides, - cudaStream_t stream); -template void GatherNd(char *input, int *indices, char *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, - int *batch_strides, cudaStream_t stream); -template void GatherNd(unsigned char *input, int *indices, unsigned char *output, - const size_t &output_dim0, const size_t &output_dim1, - const size_t &indices_dim1, int *batch_indices, int *batch_strides, - cudaStream_t stream); -template void GatherNd(bool *input, int *indices, bool *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, - int *batch_strides, cudaStream_t stream); -template void GatherNd(double *input, int64_t *indices, double *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices, - int64_t *batch_strides, cudaStream_t stream); -template void GatherNd(float *input, int64_t *indices, float *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices, - int64_t *batch_strides, cudaStream_t stream); -template void GatherNd(half *input, int64_t *indices, half *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices, - 
int64_t *batch_strides, cudaStream_t stream); -template void GatherNd(int *input, int64_t *indices, int *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices, - int64_t *batch_strides, cudaStream_t stream); -template void GatherNd(short *input, int64_t *indices, short *output, // NOLINT - const size_t &output_dim0, const size_t &output_dim1, const size_t &indices_dim1, - int64_t *batch_indices, int64_t *batch_strides, cudaStream_t stream); -template void GatherNd(unsigned int *input, int64_t *indices, unsigned int *output, - const size_t &output_dim0, const size_t &output_dim1, - const size_t &indices_dim1, int64_t *batch_indices, - int64_t *batch_strides, cudaStream_t stream); -template void GatherNd(char *input, int64_t *indices, char *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices, - int64_t *batch_strides, cudaStream_t stream); -template void GatherNd(unsigned char *input, int64_t *indices, unsigned char *output, - const size_t &output_dim0, const size_t &output_dim1, - const size_t &indices_dim1, int64_t *batch_indices, - int64_t *batch_strides, cudaStream_t stream); -template void GatherNd(bool *input, int64_t *indices, bool *output, const size_t &output_dim0, - const size_t &output_dim1, const size_t &indices_dim1, int64_t *batch_indices, - int64_t *batch_strides, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cu deleted file mode 100755 index 6e895d5f0f2..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/gatherv2.cu +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Copyright 2019-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include "plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -__global__ void GatherV2Kernel(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, - size_t output_dim2, size_t input_dim1) { - size_t num = output_dim0 * output_dim1 * output_dim2; - size_t i, j, k; - for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; - write_index += blockDim.x * gridDim.x) { - i = write_index / (output_dim1 * output_dim2) % output_dim0; - j = write_index / output_dim2 % output_dim1; - k = write_index % output_dim2; - - if ((indices[j] >= 0) && (indices[j] < input_dim1)) { - size_t read_index = i * input_dim1 * output_dim2 + indices[j] * output_dim2 + k; - output[write_index] = input[read_index]; - } else { - output[write_index] = 0; - } - } - - return; -} -template -void GatherV2(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2, - size_t input_dim1, cudaStream_t stream) { - size_t size = output_dim0 * output_dim1 * output_dim2; - GatherV2Kernel<<>>(input, indices, output, output_dim0, output_dim1, - output_dim2, input_dim1); - return; -} - -template void GatherV2(float *input, int *indices, float *output, size_t output_dim0, size_t output_dim1, - size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(float *input, int64_t *indices, float *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void 
GatherV2(half *input, int *indices, half *output, size_t output_dim0, size_t output_dim1, - size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(half *input, int64_t *indices, half *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(double *input, int *indices, double *output, size_t output_dim0, size_t output_dim1, - size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(double *input, int64_t *indices, double *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(int *input, int *indices, int *output, size_t output_dim0, size_t output_dim1, - size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(int *input, int64_t *indices, int *output, size_t output_dim0, size_t output_dim1, - size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(int16_t *input, int *indices, int16_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(int16_t *input, int64_t *indices, int16_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - cudaStream_t stream); -template void GatherV2(int8_t *input, int *indices, int8_t *output, size_t output_dim0, size_t output_dim1, - size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(int8_t *input, int64_t *indices, int8_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(uint32_t *input, int *indices, uint32_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(uint32_t *input, int64_t *indices, uint32_t *output, size_t output_dim0, - size_t 
output_dim1, size_t output_dim2, size_t input_dim1, - cudaStream_t stream); -template void GatherV2(uint8_t *input, int *indices, uint8_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(uint8_t *input, int64_t *indices, uint8_t *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - cudaStream_t stream); -template void GatherV2(bool *input, int *indices, bool *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, cudaStream_t stream); -template void GatherV2(bool *input, int64_t *indices, bool *output, size_t output_dim0, - size_t output_dim1, size_t output_dim2, size_t input_dim1, - cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cuh deleted file mode 100755 index e13d2dc124b..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/hash_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_PS_PS_CACHE_KERNEL_HASH_IMPL_H_ -#define MINDSPORE_CCSRC_PS_PS_CACHE_KERNEL_HASH_IMPL_H_ - -template -void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size, - const int hash_dim, cudaStream_t cuda_stream); - -template -void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size, - const int hash_dim, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_PS_PS_CACHE_KERNEL_HASH_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh deleted file mode 100644 index 22791f91f7e..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_INSTANCE_NORM_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_INSTANCE_NORM_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -void CopyMemDevice2Device(const size_t N, const size_t C, - float *gamma_addr, float *beta_addr, float *runing_mean_addr, float *runnig_variance_addr, - float *ws_gamma, float *ws_beta, float *ws_mean, float *ws_var, - cudaStream_t cuda_stream); -void ComputeMean(const size_t N, const size_t C, float *dgamma, float *dbeta, const float *ws_dgamma, - const float *ws_dbeta, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_INSTANCE_NORM_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh deleted file mode 100644 index 1f11270e583..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_GRAD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_GRAD_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void LayerNormGradGrad(const int& row_dim, const int& col_dim, const int& param_dim, T* global_sum1, T* global_sum2, - const T& epsilon, const T* dy, const T* x, const T* mean, const T* var, const T* gamma, - const T* grad_dx, const T* grad_dg, const T* grad_db, T* d_dy, T* d_x, T* d_gamma, - cudaStream_t stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_GRAD_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh deleted file mode 100644 index 0b292600b6c..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void LayerNormGrad(const int& row_dim, const int& col_dim, const int& param_dim, const T& epsilon, const T* dy, - const T* x, const T* mean, const T* var, const T* gamma, T* dx, T* dg, T* db, cudaStream_t stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LAYER_NORM_GRAD_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh deleted file mode 100644 index 7493f70d247..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOCAL_RESPONSE_NORM_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOCAL_RESPONSE_NORM_H_ -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void CalLocalResponseNormNHWC(const T *input, const int depth_radius, const float bias, const float alpha, - const float beta, const size_t channels, const size_t num_elements, float *scale, T *output, - cudaStream_t cuda_stream); - -template -void CalLocalResponseNormGradNHWC(const T *dy, const T *x, const T *y, const int depth_radius, const float bias, - const float alpha, const float beta, const size_t channels, const size_t num_elements, float *scale, T *dx, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LOCAL_RESPONSE_NORM_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh deleted file mode 100644 index bbb3137e8d7..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LOSS_WITH_REDUCTION_IMPL_CUH -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LOSS_WITH_REDUCTION_IMPL_CUH - -#include -#include - -enum class ReductionMode { kNone, kMean, kSum }; - -static std::map kReductionModeMap{ - {"none", ReductionMode::kNone}, {"mean", ReductionMode::kMean}, {"sum", ReductionMode::kSum}}; - -template -void BinaryCrossEntropyLoss(const int &input_size, const ReductionMode &reduction, const T *input_x, const T *input_y, - const T *weight, T *loss, T *tmp_loss, cudaStream_t stream); -template -void BinaryCrossEntropyLossGrad(const int &input_size, const ReductionMode &reduction, const T *input_x, - const T *input_y, const T *weight, const T *dloss, T *dx, cudaStream_t stream); -template -void KLDivLoss(const int &input_size, const ReductionMode &reduction, const T *input_x, const T *input_y, T *loss, - T *tmp_loss, cudaStream_t stream); -template -void KLDivLossGrad(const int &input_size, const ReductionMode &reduction, const T *input_x, const T *input_y, - const T *dloss, T *dx, T *dy, cudaStream_t stream); -template -void NLLLoss(const int n, const int c, const ReductionMode reduction, const T *input, const int32_t *target, - const S *weight, T *loss, S *total_weight, T *tmp_loss, S *tmp_target_weight, cudaStream_t stream); -template -void NLLLossGrad(const int n, const int c, const ReductionMode reduction, const T *input, const int32_t *target, - const S *weight, const S *total_weight, const T *dloss, T *dinput, cudaStream_t stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LOSS_WITH_REDUCTION_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cuh deleted file mode 100644 index f7056fc1885..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_combine_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2020 Huawei 
Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXCOMBINE_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXCOMBINE_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void MatrixCombine(const size_t size, const size_t src_height, const size_t src_width, const size_t dst_width, - const size_t residual, const size_t res_width, const size_t batch, T *input_addr, T *output_addr, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MATRIXCOMBINE_H_ - diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh deleted file mode 100644 index ea49f67e7b1..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2022 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_DIAG_PART_IMPL_CUH -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_DIAG_PART_IMPL_CUH - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void MatrixDiagPart(const size_t size, const T *input_matrix_addr, const size_t m, const size_t n, const int64_t l, - const int64_t u, const size_t num_diags, const size_t max_diag_len, const int64_t la, - const int64_t ua, T *padding_value, T *output_addr, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_MATRIX_DIAG_PART_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh deleted file mode 100644 index 19c42579013..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2022 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MATRIX_SET_DIAG_IMPL_CUH_ -#define MINDSPORE_MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MATRIX_SET_DIAG_IMPL_CUH_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void MatrixSetDiag(const int outer_batch, const int inner_row, const int inner_col, const int num_diags, - const int max_diag_len, const int lower_index, const int upper_index, - const bool right_align_super_diagonal, const bool right_align_sub_diagonal, - const bool is_single_diag, const T *diag_addr, T *output_addr, cudaStream_t cuda_stream); - -#endif // MINDSPORE_MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MATRIX_SET_DIAG_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cuh deleted file mode 100644 index a624d03bbaf..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cuh +++ /dev/null @@ -1,23 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_ -template -void CalMaxPoolWithArgmaxGrad(const T* dy, const S* index, const int n, const int c, const int xHeight, - const int xWidth, const int dyHeight, const int dyWidth, T* dx, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cuh deleted file mode 100644 index 8b088067edc..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cuh +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_ -template -void CalMaxPoolWithArgmax(const T* input, const int n, const int c, const int h, const int w, const int windowHeight, - const int windowWidth, const int strideHeight, const int strideWidth, const int padTop, - const int padLeft, const int outputHeight, const int outputWidth, T* output, S *index, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh deleted file mode 100644 index bdbc7654c53..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -void CalMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min, float *output_max, - const int total_num, const int channel_num, const float ema_decay, const bool ema, - cudaStream_t cuda_stream); - -void CalMinMaxPerLayer(float *input, float *input_min, float *input_max, float *output_min, float *output_max, - const int size, const float ema_decay, const bool ema, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh deleted file mode 100755 index d85f49fcd96..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MIRROR_PAD_IMPL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MIRROR_PAD_IMPL_H_ -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" - -// preset size of paddings -#define MAX_PADDINGS 4 -#define PADDING_SIZE 2 - -// define constants for kernel indexing use -#define BATCH 0 * PADDING_SIZE -#define CHANNEL 1 * PADDING_SIZE -#define HEIGHT 2 * PADDING_SIZE -#define WIDTH 3 * PADDING_SIZE -#define TOP 0 -#define BOTTOM 1 -#define LEFT 0 -#define RIGHT 1 - -template -void CalMirrorPad(const size_t size, const T *input, const int old_batch, const int old_channel, const int old_height, - const int old_width, const int padded_height, const int padded_width, int padd_num, - const int64_t *paddings, int mode, T *output, cudaStream_t cuda_stream); -template -void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, T *dy, T *interim, const int output_batch, - const int output_channel, const int output_height, const int output_width, const int input_height, - const int input_width, const int padd_dim, const int64_t *paddings, int mode, T *dx, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MIRROR_PAD_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh deleted file mode 100644 index aec380a6c91..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient, - const S *momentum, bool use_nesterov, cudaStream_t cuda_stream); -template -void FusedWeightDecayScaleMomentum(const size_t element_num, T *weight_decay, T *scale, T *variable, T *accumulation, - const T *learning_rate, const S *gradient, const T *momentum, - cudaStream_t cuda_stream); -template -void FusedWeightDecayMomentum(const size_t element_num, T *weight_decay, T *variable, T *accumulation, - const T *learning_rate, const S *gradient, const T *momentum, cudaStream_t cuda_stream); -template -void FusedScaleMomentum(const size_t element_num, T *scale, T *variable, T *accumulation, const T *learning_rate, - const S *gradient, const T *momentum, cudaStream_t cuda_stream); -template -void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *element, T **weight_decay, - T **scale, T **variable, T **accumulation, T **learning_rate, S **gradient, - T **momentum, cudaStream_t cuda_stream); -template -void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *element, T **scale, T **variable, - T **accumulation, T **learning_rate, S **gradient, T **momentum, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cuh deleted file mode 100644 index b30b5b43c66..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MULTINOMIAL_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MULTINOMIAL_IMPL_CUH_ -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" - -void InitRandState(int seed, int num, curandState *state, cudaStream_t stream); -template -void Multinomial(int row, int col, T *probs, curandState *rand_state, int64_t *num_sample, int *output, - cudaStream_t stream); -template -void CheckNonNeg(const size_t size, const T *input, T *output, cudaStream_t stream); -template -void CheckZero(const size_t distributions, const size_t categories, const T *input, T *output, cudaStream_t stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MULTINOMIAL_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cuh deleted file mode 100644 index d3e380b3d3a..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cuh +++ /dev/null @@ 
-1,36 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_NMS_WITH_MASK_IMPL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_NMS_WITH_MASK_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void CalSort(const int &inner, T *data_in, T *data_out, int *index_buff, T *data_buff, int box_size_, - cudaStream_t stream); - -template -void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *input, T *output, int *index_buff, int box_size_, - bool *row_mask, cudaStream_t cuda_stream); - -template -void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size_, bool *row_mask, - cudaStream_t cuda_stream); - -int NmsRoundUpPower2(int v); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_NMS_WITH_MASK_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cu deleted file mode 100644 index 1c2fe95ce19..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cu +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include "oneslike_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -__global__ void OnesLike(const size_t size, const T* input, T* output) { - int one = 1; - T val = static_cast(one); - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { - output[pos] = val; - } - return; -} -template -void CalOnesLike(const size_t size, const T* input, T* output, cudaStream_t cuda_stream) { - OnesLike<<>>(size, input, output); - return; -} - -template void CalOnesLike(const size_t size, const double* input, double* output, cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const float* input, float* output, cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const half* input, half* output, cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const int8_t* input, int8_t* output, cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const int16_t* input, int16_t* output, cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const int32_t* input, int32_t* output, cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const int64_t* input, int64_t* output, cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const uint8_t* input, uint8_t* output, cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const uint16_t* input, uint16_t* output, - cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const 
uint32_t* input, uint32_t* output, - cudaStream_t cuda_stream); -template void CalOnesLike(const size_t size, const uint64_t* input, uint64_t* output, - cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cu deleted file mode 100755 index d9df908bfdc..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pack.cu +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include "plugin/device/gpu/kernel/cuda_impl/pack.cuh" -template -__global__ void Pack(const size_t size, const size_t input_num, const size_t dims_behind_axis, T** inputs, T* output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { - size_t cur_input_index = pos / dims_behind_axis % input_num; - size_t cycle_len = input_num * dims_behind_axis; - size_t local_index = pos / cycle_len * dims_behind_axis + pos % cycle_len % dims_behind_axis; - output[pos] = inputs[cur_input_index][local_index]; - } - return; -} - -template -void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, T** inputs, T* output, - cudaStream_t cuda_stream) { - Pack<<>>(size, input_num, dims_behind_axis, inputs, output); - return; -} - - -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, int8_t** inputs, int8_t* output, - cudaStream_t cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, int16_t** inputs, int16_t* output, - cudaStream_t cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, int** inputs, int* output, - cudaStream_t cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, int64_t** inputs, int64_t* output, - cudaStream_t cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, uint8_t** inputs, uint8_t* output, - cudaStream_t cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, uint16_t** inputs, uint16_t* output, - cudaStream_t cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, uint32_t** inputs, uint32_t* output, - cudaStream_t 
cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, uint64_t** inputs, uint64_t* output, - cudaStream_t cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, half** inputs, half* output, - cudaStream_t cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, float** inputs, float* output, - cudaStream_t cuda_stream); -template void PackKernel(const size_t size, const size_t input_num, - const size_t dims_behind_axis, bool** inputs, bool* output, - cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh deleted file mode 100644 index 647f353013b..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh +++ /dev/null @@ -1,61 +0,0 @@ -/** - * Copyright 2019-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_PAD_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_PAD_IMPL_CUH_ -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void CalPad(const size_t size, const T* input, const int num, const int channels, const int old_height, - const int old_width, const int padded_height, const int padded_width, const int pad_top, const int pad_left, - float pad_value, T* output, cudaStream_t cuda_stream); -template -void CalPadGrad(const size_t size, const T* dy, const int num, const int channels, const int old_height, - const int old_width, const int padded_height, const int padded_width, const int pad_top, - const int pad_left, T* dx, cudaStream_t cuda_stream); -template -void CalPadNHWC(const size_t size, const T* input, const int num, const int old_height, const int old_width, - const int channels, const int padded_height, const int padded_width, const int pad_top, const int pad_left, - float pad_value, T* output, cudaStream_t cuda_stream); -template -void CalPadGradNHWC(const size_t size, const T* input, const int num, const int old_height, const int old_width, - const int channels, const int padded_height, const int padded_width, const int pad_top, - const int pad_left, T* output, cudaStream_t cuda_stream); -template -void CalPadGeneral(const T *input, T *output, const size_t *input_shape, const size_t *strides, - const int *paddings, const int input_size, const size_t input_rank, cudaStream_t cuda_stream); -template -void CalPad3d(const size_t size, const T* input, const int num, const int channels, const int old_depth, - const int old_height, const int old_width, const int padded_depth, const int padded_height, - const int padded_width, const int pad_head, const int pad_top, const int pad_left, const float pad_value, - T* output, cudaStream_t cuda_stream); -template -void CalPadGrad3d(const size_t size, const T* dy, const int num, const int channels, const 
int old_depth, - const int old_height, const int old_width, const int padded_depth, const int padded_height, - const int padded_width, const int pad_head, const int pad_top, const int pad_left, T* dx, - cudaStream_t cuda_stream); -template -void CalPadNDHWC(const size_t size, const T *input, const int num, const int old_depth, const int old_height, - const int old_width, const int channels, const int padded_depth, const int padded_height, - const int padded_width, const int pad_head, const int pad_top, const int pad_left, - const float pad_value, T *output, cudaStream_t cuda_stream); -template -void CalPadGradNDHWC(const size_t size, const T *dy, const int num, const int old_depth, const int old_height, - const int old_width, const int channels, const int padded_depth, const int padded_height, - const int padded_width, const int pad_head, const int pad_top, const int pad_left, T *dx, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_PAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh deleted file mode 100644 index 9ca45fcfd85..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_PS_ROI_POOLING_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_PS_ROI_POOLING_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void PSROIPoolForwardLauncher( - const T* input, const T spatial_scale, const int rois_number, const int feature_height, - const int feature_width, const int feature_channels, const int pooled_height, const int pooled_width, - const T* roi_boxes, const int group_size, const int output_channels, T* output_data, - int* mapping_channel, cudaStream_t stream); - -template -void PSROIPoolBackwardLauncher( - const T* input_diff, const int* mapping_channel, const int batch_size, - const int rois_number, const T spatial_scale, const int feature_channels, - const int feature_height, const int feature_width, const int pooled_width, const int pooled_height, - const int output_channels, T* output_diff, const T* roi_boxes, cudaStream_t stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_PS_ROI_POOLING_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh deleted file mode 100644 index 9aaa04e22f0..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RANDOM_CATEGORICAL_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RANDOM_CATEGORICAL_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void GetCdfKernel(const T *logits_addr, double** dev_cdf, const size_t batch_size, const size_t num_classes, - cudaStream_t cuda_stream); -template -void RandomCategoricalKernel(const size_t num_samples, double** dev_rand, double** dev_cdf, - const size_t batch_size, const size_t num_classes, S *output_addr, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RANDOM_CATEGORICAL_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh deleted file mode 100644 index f5b1cfd74c5..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_ - -#include -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" -#define BLOCKSIZE 256 -#define MAX_DIMENSION 5 - -template -void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, K *input, S *output_index, K *output_mask, - cudaStream_t stream); - -template -void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, const int &d1, const int &d2, - const int &d3, const int &d4, const int &d5, const int &seedc, const int &count, - const T *input, S *output_index, T *output_mask, S *index_buff, S *mask_buff, S *rank_buff, - S *Tnum_buff, S *tmp_buff, curandState *globalState, cudaStream_t stream); - -int RcwmRoundUpPower2(int v); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RANDOM_CHOICE_WITH_MASK_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cuh deleted file mode 100644 index 0b558dc01b6..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/random_op_impl.cuh +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANDOMOPIMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANDOMOPIMPL_H_ - -#include -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void StandardNormal(int seed, int seed2, curandState *globalState, - T *output, size_t count, cudaStream_t cuda_stream); -template -bool UniformInt(int seed, int seed2, curandState *globalState, - T *input1, size_t input_size_1, T *input2, size_t input_size_2, - T *output, size_t count, cudaStream_t cuda_stream); -template -void UniformReal(int seed, int seed2, curandState *globalState, - T *output, size_t count, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANDOMOPIMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cuh deleted file mode 100755 index b6aadafd54d..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cuh +++ /dev/null @@ -1,23 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RealToComplex_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RealToComplex_H_ - -template -void RealToComplex(const size_t size, const T *input, T *output, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RealToComplex_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cu deleted file mode 100644 index c4092a0b88b..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/relu_impl.cu +++ /dev/null @@ -1,97 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void CalReLUKernel(int size, T *input_addr, T *output_addr) { - for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { - output_addr[pos] = input_addr[pos] > static_cast(0) ? 
input_addr[pos] : static_cast(0); - } -} - -template -void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { - CalReLUKernel<<>>(size, input_addr, output_addr); -} - -template void CalReLU(int size, double *input_addr, double *output_addr, cudaStream_t cuda_stream); -template void CalReLU(int size, float *input_addr, float *output_addr, cudaStream_t cuda_stream); -template void CalReLU(int size, half *input_addr, half *output_addr, cudaStream_t cuda_stream); -template void CalReLU(int size, int8_t *input_addr, int8_t *output_addr, cudaStream_t cuda_stream); -template void CalReLU(int size, int16_t *input_addr, int16_t *output_addr, cudaStream_t cuda_stream); -template void CalReLU(int size, int32_t *input_addr, int32_t *output_addr, cudaStream_t cuda_stream); -template void CalReLU(int size, int64_t *input_addr, int64_t *output_addr, cudaStream_t cuda_stream); -template void CalReLU(int size, uint8_t *input_addr, uint8_t *output_addr, cudaStream_t cuda_stream); - -template -__global__ void ReluV2Kernel(const size_t num, const T *x, T *y, uint32_t *mask) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) { - T v = x[i]; - bool p = v > static_cast(0); - y[i] = p ? v : static_cast(0); - - auto warp_predict = BallotSync(p, __activemask()); - if (LaneId() == 0) { - mask[WarpId(i)] = warp_predict; - } - } -} - -template -void ReluV2(const size_t num, const T *x, T *y, uint32_t *mask, cudaStream_t cuda_stream) { - ReluV2Kernel<<>>(num, x, y, mask); -} - -template -__global__ void ReluGradV2Kernel(const size_t num, const T *dy, const uint32_t *mask, T *dx) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) { - bool p = mask[WarpId(i)] & (1 << LaneId()); - dx[i] = p ? 
dy[i] : static_cast(0); - } -} - -template -void ReluGradV2(const size_t num, const T *dy, const uint32_t *mask, T *dx, cudaStream_t cuda_stream) { - ReluGradV2Kernel<<>>(num, dy, mask, dx); -} - -template void ReluV2(const size_t num, const double *x, double *y, uint32_t *mask, cudaStream_t cuda_stream); -template void ReluV2(const size_t num, const float *x, float *y, uint32_t *mask, cudaStream_t cuda_stream); -template void ReluV2(const size_t num, const half *x, half *y, uint32_t *mask, cudaStream_t cuda_stream); -template void ReluV2(const size_t num, const int8_t *x, int8_t *y, uint32_t *mask, cudaStream_t cuda_stream); -template void ReluV2(const size_t num, const int16_t *x, int16_t *y, uint32_t *mask, cudaStream_t cuda_stream); -template void ReluV2(const size_t num, const int32_t *x, int32_t *y, uint32_t *mask, cudaStream_t cuda_stream); -template void ReluV2(const size_t num, const int64_t *x, int64_t *y, uint32_t *mask, cudaStream_t cuda_stream); -template void ReluV2(const size_t num, const uint8_t *x, uint8_t *y, uint32_t *mask, cudaStream_t cuda_stream); - -template void ReluGradV2(const size_t num, const double *dy, const uint32_t *mask, double *dx, - cudaStream_t cuda_stream); -template void ReluGradV2(const size_t num, const float *dy, const uint32_t *mask, float *dx, cudaStream_t cuda_stream); -template void ReluGradV2(const size_t num, const half *dy, const uint32_t *mask, half *dx, cudaStream_t cuda_stream); -template void ReluGradV2(const size_t num, const int8_t *dy, const uint32_t *mask, int8_t *dx, - cudaStream_t cuda_stream); -template void ReluGradV2(const size_t num, const int16_t *dy, const uint32_t *mask, int16_t *dx, - cudaStream_t cuda_stream); -template void ReluGradV2(const size_t num, const int32_t *dy, const uint32_t *mask, int32_t *dx, - cudaStream_t cuda_stream); -template void ReluGradV2(const size_t num, const int64_t *dy, const uint32_t *mask, int64_t *dx, - cudaStream_t cuda_stream); -template void ReluGradV2(const size_t 
num, const uint8_t *dy, const uint32_t *mask, uint8_t *dx, - cudaStream_t cuda_stream); - diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh deleted file mode 100644 index 4d2a4350232..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RESIZE_BILINEAR_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RESIZE_BILINEAR_H_ -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void CalResizeBilinear(const T *input, const int n_, const int c_, const int input_h_, const int input_w_, - const int output_h_, const int output_w_, const float h_scale, const float w_scale, T *output, - cudaStream_t cuda_stream); -void CalResizeBilinearGrad(const half *input, const int n_, const int c_, const int input_h_, const int input_w_, - const int output_h_, const int output_w_, const float h_scale, const float w_scale, half *output, float *interim, - cudaStream_t cuda_stream); -void CalResizeBilinearGrad(const float *input, const int n_, const int c_, const int input_h_, const int input_w_, - const int output_h_, const int output_w_, const float h_scale, const float w_scale, float *output, float *interim, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RESIZE_BILINEAR_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh deleted file mode 100644 index 926b0215f6e..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_ -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" -#define RESIZENEARESTNEIGHBORGRAD_DIMENSION 4 - -template -void CalResizeNearestNeighborGrad(const int input_size, const T *input, const int s1, const int s2, const int s3, - const int s4, T *output, const int d1, const int d2, const int d3, const int d4, - bool align_corners, float h_scale, float w_scale, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_GRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh deleted file mode 100644 index fc68a67e117..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_ -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" -#define RESIZENEARESTNEIGHBOR_DIMENSION 4 - -template -void CalResizeNearestNeighbor(const int size, const T *input, const int s1, const int s2, const int s3, const int s4, - T *output, const int d1, const int d2, const int d3, const int d4, bool align_corners, - float h_scale, float w_scale, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_RESIZE_NEAREST_NEIGHBOR_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cu deleted file mode 100644 index 52b98e8c6e0..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cu +++ /dev/null @@ -1,159 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include -#include -#include "plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh" - -// Util function to convert a 1D input array index to an N-D positional index -// Required since GPU iterates over all values in an ND array as a 1D array -__inline__ __device__ void IdxToPos(size_t idx, size_t *pos, size_t cur_thread_idx, size_t *cum_shape, - size_t shape_size) { - size_t rem_val = idx; - for (int i = 0; i < shape_size; i++) { - pos[cur_thread_idx + i] = rem_val / cum_shape[i]; - rem_val = rem_val % cum_shape[i]; - } - return; -} - -// Util function to convert a N-D positonal index to a 1D index -__inline__ __device__ size_t PosToIdx(size_t *pos, size_t cur_thread_idx, size_t *cum_shape, size_t shape_size) { - size_t idx = 0; - for (int i = 0; i < shape_size; i++) { - idx = idx + (pos[cur_thread_idx + i] * cum_shape[i]); - } - return idx; -} - -// CumShape takes Shape: (2,2,5) => cumShape (10,5,1) which informs how many values -// each dimension will represent. Required for converting 1d index to positional vector. 
-// In this example 10 in dim 0 means, an increase of 1 in this dim leads to another 10 values -// in the overall array -__global__ void ComputeCumShape(const size_t *input_shape_ptr, size_t *input_shape_cum_ptr, size_t shape_size) { - int cur_val = 1; - for (int i = shape_size - 1; i >= 0; i--) { - // iterate list in reverse and cummulatively build shape - input_shape_cum_ptr[i] = cur_val; - cur_val = cur_val * input_shape_ptr[i]; - } - return; -} -template -__global__ void ReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim, - const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr, - size_t *input_shape_cum_ptr, size_t shape_size, T *output) { - // calculate which thread this is out of total across all blocks for accessing respective cur_pos_arr memory - size_t cur_thread_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - cur_thread_idx = cur_thread_idx * shape_size; - size_t cur_slice = 0; // current slice as split by the batch_dim - size_t cur_slice_seq_len = 0; // reverse seq length for this slice as provided by user - size_t new_idx = 0; // calculate corresponding reverse element from input - for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) { - IdxToPos(idx, cur_pos_arr, cur_thread_idx, input_shape_cum_ptr, shape_size); - cur_slice = cur_pos_arr[cur_thread_idx + batch_dim]; // all accesses to cur_pos_arr have to be adjusted per thread - cur_slice_seq_len = seq_len[cur_slice]; - if (cur_slice_seq_len == 0) { // adjust length to 1 if 0 provided, same result in both cases - cur_slice_seq_len = 1; - } - if (cur_pos_arr[cur_thread_idx + seq_dim] > (cur_slice_seq_len - 1)) { // check if within range - // copy value directly and continue - outside of reversal range - output[idx] = input[idx]; - continue; - } - // find corresponding reverse element in input - cur_pos_arr[cur_thread_idx + seq_dim] = - (cur_slice_seq_len - 1) - cur_pos_arr[cur_thread_idx + 
seq_dim]; // adjust position to target - new_idx = PosToIdx(cur_pos_arr, cur_thread_idx, input_shape_cum_ptr, shape_size); // get the updated index - output[idx] = input[new_idx]; - } - return; -} - -template -void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim, - const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr, - size_t *input_shape_cum_ptr, size_t shape_size, T *output, cudaStream_t cuda_stream) { - ComputeCumShape<<<1, 1, 0, cuda_stream>>>(input_shape_ptr, input_shape_cum_ptr, shape_size); - ReverseSequence<<>>( - size, input, seq_len, batch_dim, seq_dim, cur_pos_arr, input_shape_ptr, input_shape_cum_ptr, shape_size, output); - return; -} - -template void CalReverseSequence(const size_t size, const int8_t *input, const int *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, int8_t *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const int8_t *input, const int64_t *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, int8_t *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const int16_t *input, const int *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, int16_t *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const int16_t *input, const int64_t *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, int16_t *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const int *input, const int *seq_len, - 
const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, int *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const int *input, const int64_t *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, int *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const int64_t *input, const int *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, int64_t *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const int64_t *input, const int64_t *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, int64_t *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const half *input, const int *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, half *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const half *input, const int64_t *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, half *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const float *input, const int *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, float *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, 
const float *input, const int64_t *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, float *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const double *input, const int *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, double *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const double *input, const int64_t *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, double *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const bool *input, const int *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, bool *output, cudaStream_t cuda_stream); -template void CalReverseSequence(const size_t size, const bool *input, const int64_t *seq_len, - const int64_t batch_dim, const int64_t seq_dim, size_t *cur_pos_arr, - const size_t *input_shape_ptr, size_t *intput_shape_cum_ptr, - size_t shape_size, bool *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh deleted file mode 100644 index 57b940bce21..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_SEQUENCE_IMPL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_SEQUENCE_IMPL_H_ -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void CalReverseSequence(const size_t size, const T *input, const S *seq_len, const int64_t batch_dim, - const int64_t seq_dim, size_t *cur_pos_arr, const size_t *input_shape_ptr, - size_t *intput_shape_cum_ptr, size_t shape_size, T *output, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_REVERSE_SEQUENCE_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cu deleted file mode 100644 index 81a9e0aa56b..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cu +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include "reverse_v2_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -__global__ void ReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides, - const int64_t* axis, size_t input_size, size_t axis_size) { - for (int64_t gt_id = blockIdx.x * blockDim.x + threadIdx.x; gt_id < input_size; gt_id += blockDim.x * gridDim.x) { - int64_t intermediate_index = gt_id; - for (size_t i = 0; i < axis_size; i++) { - int64_t d = axis[i]; - int64_t pre_reverse_position = (gt_id / strides[d]) % input_shape[d]; - int64_t reversed_position = input_shape[d] - pre_reverse_position - 1; - intermediate_index += ((reversed_position - pre_reverse_position) * strides[d]); - } - - output[intermediate_index] = input[gt_id]; - } - return; -} -template -void CalReverseV2(const T* input, T* output, const size_t* input_shape, const int64_t* strides, const int64_t* axis, - size_t input_size, size_t axis_size, cudaStream_t cuda_stream) { - ReverseV2<<>>(input, output, input_shape, strides, axis, - input_size, axis_size); - return; -} - -template void CalReverseV2(const half* input, half* output, const size_t* input_shape, const int64_t* strides, - const int64_t* axis, size_t input_size, size_t axis_size, cudaStream_t cuda_stream); - -template void CalReverseV2(const float* input, float* output, const size_t* input_shape, const int64_t* strides, - const int64_t* axis, size_t input_size, size_t axis_size, cudaStream_t cuda_stream); - -template void CalReverseV2(const uint8_t* input, uint8_t* output, const size_t* input_shape, - const int64_t* strides, const int64_t* axis, size_t input_size, size_t axis_size, - cudaStream_t cuda_stream); - -template void CalReverseV2(const int16_t* input, int16_t* output, const size_t* input_shape, - const int64_t* strides, const int64_t* axis, size_t input_size, size_t axis_size, - cudaStream_t cuda_stream); - -template void CalReverseV2(const int32_t* input, int32_t* output, const size_t* 
input_shape, - const int64_t* strides, const int64_t* axis, size_t input_size, size_t axis_size, - cudaStream_t cuda_stream); - -template void CalReverseV2(const int64_t* input, int64_t* output, const size_t* input_shape, - const int64_t* strides, const int64_t* axis, size_t input_size, size_t axis_size, - cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh index 07248a8a7bb..25688b4154a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh @@ -17,7 +17,7 @@ #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RL_BUFFER_IMPL_H_ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RL_BUFFER_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void BufferAppend(const int64_t capacity, const size_t size, const int *index, const int exp_batch, unsigned char *buffer, const unsigned char *exp, cudaStream_t cuda_stream); void IncreaseCount(const int64_t capacity, const int exp_batch, int *count, int *head, int *index, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh deleted file mode 100644 index 40a63ce48de..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RMSPROP_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RMSPROP_H_ -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void RmsProp(const T* learning_rate, const T decay, const T momentum, const T epsilon, T* variable, T* mean_square, - T* moment, T* gradients, const size_t size, cudaStream_t cuda_stream); - -template -void RmsPropCenter(const T* learning_rate, const T* decay, const T* momentum, const T* epsilon, T* variable, - T* mean_gradients, T* mean_square, T* moment, T* gradients, const size_t size, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RMSPROP_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh deleted file mode 100644 index 5a71e88d273..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ROI_ALIGN_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ROI_ALIGN_IMPL_H_ -template -void ROIAlign(const T *x, const T *roi_boxes, int roi_rows, int roi_cols, T *out_data, const T spatial_scale, - const int sample_num, int roi_end_mode, const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, cudaStream_t cuda_stream); - -template -void ROIAlignGrad(const T *dy, const T *roi_boxes, int batch_size, int roi_rows, int roi_cols, T *dx, - const T spatial_scale, const int sample_num, int roi_end_mode, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_ROI_ALIGN_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cu deleted file mode 100644 index efc0a21e836..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cu +++ /dev/null @@ -1,103 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh" - -template -__global__ void ScatterUpdateKernel(const size_t inner_size, const size_t updates_size, const S *indices, - const T *updates, T *input) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) { - const size_t index = pos / inner_size; - const size_t offset = pos % inner_size; - const size_t current_pos = indices[index] * inner_size + offset; - input[current_pos] = updates[pos]; - } -} - -template -__global__ void ScatterAddKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates, - T *input) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) { - const size_t index = pos / inner_size; - const size_t offset = pos % inner_size; - const size_t current_pos = indices[index] * inner_size + offset; - MsAtomicAdd(&input[current_pos], updates[pos]); - } -} - -template -__global__ void ScatterSubKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates, - T *input) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) { - const size_t index = pos / inner_size; - const size_t offset = pos % inner_size; - const size_t current_pos = indices[index] * inner_size + offset; - MsAtomicAdd(&input[current_pos], -updates[pos]); - } -} - -template -void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, const size_t &indices_size, - const S *indices, const T *updates, T *input, cudaStream_t cuda_stream) { - const size_t updates_size = inner_size * indices_size; - switch (func_type) { - case SCATTER_FUNC_UPDATE: - return ScatterUpdateKernel<<>>(inner_size, updates_size, - indices, updates, input); - case SCATTER_FUNC_ADD: - return ScatterAddKernel<<>>(inner_size, updates_size, - 
indices, updates, input); - case SCATTER_FUNC_SUB: - return ScatterSubKernel<<>>(inner_size, updates_size, - indices, updates, input); - default: - break; - } -} - -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int *indices, const float *updates, - float *input, cudaStream_t cuda_stream); -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int64_t *indices, const float *updates, - float *input, cudaStream_t cuda_stream); -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int *indices, const half *updates, half *input, - cudaStream_t cuda_stream); -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int64_t *indices, const half *updates, - half *input, cudaStream_t cuda_stream); -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int *indices, const int *updates, int *input, - cudaStream_t cuda_stream); -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int64_t *indices, const int *updates, - int *input, cudaStream_t cuda_stream); -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int *indices, - const unsigned char *updates, unsigned char *input, - cudaStream_t cuda_stream); -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int64_t *indices, - const unsigned char *updates, unsigned char *input, - cudaStream_t cuda_stream); -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int *indices, const int8_t *updates, - int8_t 
*input, cudaStream_t cuda_stream); -template void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, - const size_t &indices_size, const int64_t *indices, const int8_t *updates, - int8_t *input, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cu deleted file mode 100644 index 1780e97447c..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cu +++ /dev/null @@ -1,110 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void ScatterNdKernel(S *indices, T *update, T *output, const size_t block_size, const size_t input_size, - const size_t output_size, const size_t indices_dim_0, const size_t indices_dim_1, - S *indices_stride, S *work_shape) { - int i, j; - for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; - read_index += blockDim.x * gridDim.x) { - size_t write_index = 0; - bool out_bound = false; - - i = read_index / block_size; - j = read_index % block_size; - - for (size_t k = 0; k < indices_dim_1; k++) { - S indices_i = indices[i * indices_dim_1 + k]; - out_bound |= indices_i >= work_shape[k]; - write_index += indices_i * indices_stride[k]; - } - - write_index += j; - out_bound |= write_index >= output_size; - - if (!out_bound) { - MsAtomicAdd(&output[write_index], update[read_index]); - } - } -} - -template -void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride, - S *work_shape, cudaStream_t stream) { - ScatterNdKernel<<>>(indices, update, output, block_size, input_size, - output_size, indices_dim_0, indices_dim_1, - indices_stride, work_shape); - return; -} - -template void ScatterNd(int *indices, double *update, double *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void ScatterNd(int64_t *indices, double *update, double *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int64_t *indices_stride, int64_t *work_shape, 
cudaStream_t stream); -template void ScatterNd(int *indices, float *update, float *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void ScatterNd(int64_t *indices, float *update, float *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int64_t *indices_stride, int64_t *work_shape, cudaStream_t stream); -template void ScatterNd(int *indices, half *update, half *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void ScatterNd(int64_t *indices, half *update, half *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); -template void ScatterNd(int *indices, int *update, int *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void ScatterNd(int64_t *indices, int *update, int *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); -// NOLINTNEXTLINE -template void ScatterNd(int *indices, short *update, short *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -// NOLINTNEXTLINE -template 
void ScatterNd(int64_t *indices, short *update, short *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int64_t *indices_stride, int64_t *work_shape, cudaStream_t stream); -template void ScatterNd(int *indices, unsigned char *update, unsigned char *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void ScatterNd(int64_t *indices, unsigned char *update, unsigned char *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, - int64_t *work_shape, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh deleted file mode 100644 index 8bf142abaf9..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_SCATTER_ND_GPU_CU_H -#define MINDSPORE_SCATTER_ND_GPU_CU_H - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride, - S *work_shape, cudaStream_t stream); -#endif // MINDSPORE_SCATTER_ND_GPU_CU_H diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cu deleted file mode 100644 index 8aca3518dd2..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cu +++ /dev/null @@ -1,181 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh" - -template -__global__ void ScatterNdUpdate(const size_t unit_size, const size_t index_depth, const size_t updates_size, - const S *out_strides, const S *indices, const T *updates, T *input) { - int i, j; - for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size); - read_index += blockDim.x * gridDim.x) { - size_t write_index = 0; - bool out_bound = false; - - i = read_index / unit_size; - j = read_index % unit_size; - - for (size_t k = 0; k < index_depth; k++) { - S indices_i = indices[i * index_depth + k]; - out_bound |= indices_i < 0; - write_index += indices_i * out_strides[k] * unit_size; - } - - write_index += j; - - if (!out_bound) { - input[write_index] = updates[read_index]; - } - } -} - -template -__global__ void ScatterNdAdd(const size_t unit_size, const size_t index_depth, const size_t updates_size, - const S *out_strides, const S *indices, const T *updates, T *input) { - int i, j; - for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size); - read_index += blockDim.x * gridDim.x) { - size_t write_index = 0; - bool out_bound = false; - - i = read_index / unit_size; - j = read_index % unit_size; - - for (size_t k = 0; k < index_depth; k++) { - S indices_i = indices[i * index_depth + k]; - out_bound |= indices_i < 0; - write_index += indices_i * out_strides[k] * unit_size; - } - - write_index += j; - - if (!out_bound) { - MsAtomicAdd(&input[write_index], updates[read_index]); - } - } -} - -template -__global__ void ScatterNdSub(const size_t unit_size, const size_t index_depth, const size_t updates_size, - const S *out_strides, const S *indices, const T *updates, T *input) { - int i, j; - for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size); - read_index += blockDim.x * gridDim.x) { - size_t write_index = 0; - 
bool out_bound = false; - - i = read_index / unit_size; - j = read_index % unit_size; - - for (size_t k = 0; k < index_depth; k++) { - S indices_i = indices[i * index_depth + k]; - out_bound |= indices_i < 0; - write_index += indices_i * out_strides[k] * unit_size; - } - - write_index += j; - - if (!out_bound) { - MsAtomicAdd(&input[write_index], -updates[read_index]); - } - } -} - -template -void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, const size_t &num_units, - const size_t &index_depth, const S *out_strides, const S *indices, const T *updates, T *input, - cudaStream_t cuda_stream) { - const size_t updates_size = unit_size * num_units; - switch (func_type) { - case SCATTER_ND_FUNC_UPDATE: - return ScatterNdUpdate<<>>( - unit_size, index_depth, updates_size, out_strides, indices, updates, input); - case SCATTER_ND_FUNC_ADD: - return ScatterNdAdd<<>>( - unit_size, index_depth, updates_size, out_strides, indices, updates, input); - case SCATTER_ND_FUNC_SUB: - return ScatterNdSub<<>>( - unit_size, index_depth, updates_size, out_strides, indices, updates, input); - default: - break; - } -} - -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int64_t *out_strides, const int64_t *indices, - const double *updates, double *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int32_t *out_strides, const int32_t *indices, - const double *updates, double *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int64_t *out_strides, const int64_t *indices, - const float *updates, float *input, cudaStream_t cuda_stream); -template void 
CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int32_t *out_strides, const int32_t *indices, - const float *updates, float *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int64_t *out_strides, const int64_t *indices, - const half *updates, half *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int32_t *out_strides, const int32_t *indices, - const half *updates, half *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int64_t *out_strides, const int64_t *indices, - const int32_t *updates, int32_t *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int32_t *out_strides, const int32_t *indices, - const int32_t *updates, int32_t *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int64_t *out_strides, const int64_t *indices, - const int16_t *updates, int16_t *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int32_t *out_strides, const int32_t *indices, - const int16_t *updates, int16_t *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t 
&num_units, const size_t &index_depth, - const int64_t *out_strides, const int64_t *indices, - const uint8_t *updates, uint8_t *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int32_t *out_strides, const int32_t *indices, - const uint8_t *updates, uint8_t *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int64_t *out_strides, const int64_t *indices, - const int8_t *updates, int8_t *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int32_t *out_strides, const int32_t *indices, - const int8_t *updates, int8_t *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int64_t *out_strides, const int64_t *indices, - const bool *updates, bool *input, cudaStream_t cuda_stream); -template void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, - const size_t &num_units, const size_t &index_depth, - const int32_t *out_strides, const int32_t *indices, - const bool *updates, bool *input, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cu deleted file mode 100644 index 4f11a9682ea..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/select_impl.cu +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in 
compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include "plugin/device/gpu/kernel/cuda_impl/select_impl.cuh" - -template -__global__ void Select(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { - output[pos] = cond[pos] ? input_x[pos] : input_y[pos]; - } - return; -} - -template -void CalSelect(const size_t size, const bool* cond, const T* input_x, const T* input_y, T* output, - cudaStream_t cuda_stream) { - Select<<>>(size, cond, input_x, input_y, output); - return; -} - -template void CalSelect(const size_t size, const bool* cond, const double* input_X, const double* input_y, - double* output, cudaStream_t cuda_stream); -template void CalSelect(const size_t size, const bool* cond, const float* input_X, const float* input_y, - float* output, cudaStream_t cuda_stream); -template void CalSelect(const size_t size, const bool* cond, const int* input_X, const int* input_y, int* output, - cudaStream_t cuda_stream); -template void CalSelect(const size_t size, const bool* cond, const half* input_X, const half* input_y, - half* output, cudaStream_t cuda_stream); -template void CalSelect(const size_t size, const bool* cond, const int64_t* input_X, const int64_t* input_y, - int64_t* output, cudaStream_t cuda_stream); -template void CalSelect(const size_t size, const bool *cond, const bool *input_X, const bool *input_y, - bool *output, cudaStream_t cuda_stream); diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cu deleted file mode 100644 index f63cf62786d..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cu +++ /dev/null @@ -1,132 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include "plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh" - -namespace { -constexpr size_t kMaxDim = 8; -} - -template -class VectorWrapper { - public: - explicit VectorWrapper(const std::vector &v) { std::copy(v.begin(), v.end(), data); } - ~VectorWrapper() {} - __device__ T& operator[](size_t index) { return data[index]; } - - private: - T data[N]; -}; - -template -__global__ void CopySlicesKernel(VectorWrapper begins, VectorWrapper stride, - VectorWrapper u, VectorWrapper u_offset, - VectorWrapper o_offset, const T *update_addr, T *output_addr) { - size_t update_num = u[0] * u_offset[0]; - - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < update_num; pos += blockDim.x * gridDim.x) { - size_t i = pos / (u_offset[0]) % u[0]; - size_t j = pos / (u_offset[1]) % u[1]; - size_t k = pos / (u_offset[2]) % u[2]; - size_t l = pos / (u_offset[3]) % u[3]; - size_t m = pos / (u_offset[4]) % u[4]; - size_t n = pos / (u_offset[5]) % u[5]; - size_t o = pos / (u[7]) % u[6]; - 
size_t p = pos % u[7]; - - size_t output_idx = (i * stride[0] + begins[0]) * o_offset[0] + (j * stride[1] + begins[1]) * o_offset[1] + - (k * stride[2] + begins[2]) * o_offset[2] + (l * stride[3] + begins[3]) * o_offset[3] + - (m * stride[4] + begins[4]) * o_offset[4] + (n * stride[5] + begins[5]) * o_offset[5] + - (o * stride[6] + begins[6]) * o_offset[6] + (p * stride[7] + begins[7]); - output_addr[output_idx] = update_addr[pos]; - } -} - -std::vector CalculateOffset(const std::vector &shape) { - std::vector offset(kMaxDim); - offset[7] = 1; - offset[6] = offset[7] * shape[7]; - offset[5] = offset[6] * shape[6]; - offset[4] = offset[5] * shape[5]; - offset[3] = offset[4] * shape[4]; - offset[2] = offset[3] * shape[3]; - offset[1] = offset[2] * shape[2]; - offset[0] = offset[1] * shape[1]; - return offset; -} - -template -void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, const T *update, T *output, - cudaStream_t cuda_stream) { - size_t size = std::accumulate(update_shape.begin(), update_shape.end(), 1, std::multiplies()); - - VectorWrapper o_offset(CalculateOffset(output_shape)); - VectorWrapper u_offset(CalculateOffset(update_shape)); - - VectorWrapper begins(begin); - VectorWrapper strides(stride); - VectorWrapper update_shapes(update_shape); - - CopySlicesKernel<<>>(begins, strides, update_shapes, u_offset, - o_offset, update, output); -} - -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const bool *update, bool *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const double *update, double *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const 
std::vector &output_shape, - const float *update, float *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const half *update, half *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const int64_t *update, int64_t *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, const int *update, - int *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const short *update, short *output, cudaStream_t cuda_stream); // NOLINT -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const int8_t *update, int8_t *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const uint64_t *update, uint64_t *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const uint32_t *update, uint32_t *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const uint16_t *update, uint16_t *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const unsigned char *update, 
unsigned char *output, cudaStream_t cuda_stream); -template void CopySlices(const std::vector &update_shape, const std::vector &begin, - const std::vector &stride, const std::vector &output_shape, - const char *update, char *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cu deleted file mode 100644 index f9b2e1dabb9..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cu +++ /dev/null @@ -1,618 +0,0 @@ -/** - * Copyright 2019-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh" - -template -__global__ void Slice1D(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1; pos += blockDim.x * gridDim.x) { - output[pos] = input[pos + s1]; - } -} - -template -__global__ void Slice2D(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, - const size_t d2, const T *input, T *output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2; pos += blockDim.x * gridDim.x) { - size_t i = pos / l2 % l1; - size_t j = pos % l2; - - size_t offset = (i + s1) * d2 + (j + s2); - output[pos] = input[offset]; - } -} - -template -__global__ void Slice3D(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, - const size_t l3, const size_t d1, const size_t d2, const size_t d3, const T *input, T *output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3; pos += blockDim.x * gridDim.x) { - size_t i = pos / (l2 * l3) % l1; - size_t j = pos / l3 % l2; - size_t k = pos % l3; - - size_t offset = (i + s1) * (d2 * d3) + (j + s2) * d3 + (k + s3); - output[pos] = input[offset]; - } -} - -template -__global__ void Slice4D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const T *input, T *output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4; pos += blockDim.x * gridDim.x) { - size_t i = pos / (l2 * l3 * l4) % l1; - size_t j = pos / (l3 * l4) % l2; - size_t k = pos / l4 % l3; - size_t o = pos % l4; - - size_t offset = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4); - output[pos] = input[offset]; - } -} - -template -__global__ void 
Slice5D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, - const T *input, T *output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5; - pos += blockDim.x * gridDim.x) { - size_t i = pos / (l2 * l3 * l4 * l5) % l1; - size_t j = pos / (l3 * l4 * l5) % l2; - size_t k = pos / (l4 * l5) % l3; - size_t o = pos / l5 % l4; - size_t q = pos % l5; - - size_t offset = - (i + s1) * (d2 * d3 * d4 * d5) + (j + s2) * (d3 * d4 * d5) + (k + s3) * (d4 * d5) + (o + s4) * d5 + (q + s5); - output[pos] = input[offset]; - } -} - -template -__global__ void Slice6D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, - const size_t d4, const size_t d5, const size_t d6, const T *input, T *output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5 * l6; - pos += blockDim.x * gridDim.x) { - size_t i = pos / (l2 * l3 * l4 * l5 * l6) % l1; - size_t j = pos / (l3 * l4 * l5 * l6) % l2; - size_t k = pos / (l4 * l5 * l6) % l3; - size_t o = pos / (l5 * l6) % l4; - size_t q = pos / l6 % l5; - size_t r = pos % l6; - - size_t offset = - (i + s1) * (d2 * d3 * d4 * d5 * d6) + (j + s2) * (d3 * d4 * d5 * d6) + (k + s3) * (d4 * d5 * d6) + (o + s4) * - (d5 * d6) + (q + s5) * d6 + (r + s6); - output[pos] = input[offset]; - } -} - -template -__global__ void Slice7D(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3, - const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, - 
const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const size_t d7, const T *input, T *output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < l1 * l2 * l3 * l4 * l5 * l6 * l7; - pos += blockDim.x * gridDim.x) { - size_t i = pos / (l2 * l3 * l4 * l5 * l6 * l7) % l1; - size_t j = pos / (l3 * l4 * l5 * l6 * l7) % l2; - size_t k = pos / (l4 * l5 * l6 * l7) % l3; - size_t o = pos / (l5 * l6 * l7) % l4; - size_t q = pos / (l6 * l7) % l5; - size_t r = pos / l7 % l6; - size_t s = pos % l7; - - size_t offset = - (i + s1) * (d2 * d3 * d4 * d5 * d6 * d7) + (j + s2) * (d3 * d4 * d5 * d6 * d7) + (k + s3) * (d4 * d5 * d6 * d7)+ - (o + s4) * (d5 * d6 * d7) + (q + s5) * (d6 * d7) + (r + s6) * d7 + (s + s7); - output[pos] = input[offset]; - } -} - -template -__global__ void Slice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, - const T *dy, T *dx) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (l1 * l2 * l3 * l4); pos += blockDim.x * gridDim.x) { - size_t i = pos / (l2 * l3 * l4) % l1; - size_t j = pos / (l3 * l4) % l2; - size_t k = pos / l4 % l3; - size_t o = pos % l4; - size_t input_idx = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4); - dx[input_idx] = dy[pos]; - } -} - -template -__global__ void FillArray(T *addr, const size_t len, const float value) { - T value_ = static_cast(value); - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < len; pos += blockDim.x * gridDim.x) { - addr[pos] = value_; - } - return; -} -template -void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream) { - FillArray<<>>(addr, input_size, value); - return; -} -template -void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output, cudaStream_t 
stream) { - Slice1D<<>>(s1, l1, d1, input, output); -} -template -void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, const size_t d2, - const T *input, T *output, cudaStream_t stream) { - Slice2D<<>>(s1, s2, l1, l2, d1, d2, input, output); -} -template -void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, const size_t l3, - const size_t d1, const size_t d2, const size_t d3, const T *input, T *output, cudaStream_t stream) { - Slice3D<<>>(s1, s2, s3, l1, l2, l3, d1, d2, d3, input, output); -} -template -void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, const size_t l2, - const size_t l3, const size_t l4, const size_t d1, const size_t d2, const size_t d3, const size_t d4, - const T *input, T *output, cudaStream_t stream) { - Slice4D<<>>(s1, s2, s3, s4, l1, l2, l3, l4, d1, d2, d3, d4, - input, output); -} -template -void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const size_t d5, const T *input, T *output, cudaStream_t stream) { - Slice5D<<>>(s1, s2, s3, s4, s5, l1, l2, l3, l4, l5, d1, - d2, d3, d4, d5, input, output); -} -template -void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t l6, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const T *input, T *output, cudaStream_t stream) { - Slice6D<<>>(s1, s2, s3, s4, s5, s6, l1, l2, l3, l4, - l5, l6, d1, d2, d3, d4, d5, d6, input, - output); -} -template -void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t 
s4, const size_t s5, const size_t s6, - const size_t s7, const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t l6, const size_t l7, const size_t d1, const size_t d2, const size_t d3, const size_t d4, - const size_t d5, const size_t d6, const size_t d7, const T *input, T *output, cudaStream_t stream) { - Slice7D<<>>(s1, s2, s3, s4, s5, s6, s7, l1, l2, - l3, l4, l5, l6, l7, d1, d2, d3, d4, - d5, d6, d7, input, output); -} -template -void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const T *dy, T *dx, cudaStream_t stream) { - Slice4DGrad<<>>(s1, s2, s3, s4, l1, l2, l3, l4, d1, d2, d3, d4, - dy, dx); -} - -template -__global__ void StridedSliceKernel(const size_t b0, const size_t b1, const size_t b2, const size_t b3, const size_t b4, - const size_t b5, const size_t b6, const size_t s0, const size_t s1, const size_t s2, - const size_t s3, const size_t s4, const size_t s5, const size_t s6, const size_t i0, - const size_t i1, const size_t i2, const size_t i3, const size_t i4, const size_t i5, - const size_t i6, const size_t o0, const size_t o1, const size_t o2, const size_t o3, - const size_t o4, const size_t o5, const size_t o6, const T *input_addr, - T *output_addr) { - size_t output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6; - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) { - size_t i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0; - size_t j = pos / (o2 * o3 * o4 * o5 * o6) % o1; - size_t k = pos / (o3 * o4 * o5 * o6) % o2; - size_t l = pos / (o4 * o5 * o6) % o3; - size_t m = pos / (o5 * o6) % o4; - size_t n = pos / (o6) % o5; - size_t o = pos % o6; - - size_t input_idx = (i * s0 + b0) * i1 * i2 * i3 * i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 + - (k * s2 + b2) * i3 * i4 * i5 * i6 + 
(l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 + - (n * s5 + b5) * i6 + (o * s6 + b6); - output_addr[pos] = input_addr[input_idx]; - } -} - -template -void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, const T *input, - T *output, cudaStream_t cuda_stream) { - size_t size = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3] * output_shape[4] * - output_shape[5] * output_shape[6]; - StridedSliceKernel<<>>( - begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6], strides[0], strides[1], strides[2], - strides[3], strides[4], strides[5], strides[6], input_shape[0], input_shape[1], input_shape[2], input_shape[3], - input_shape[4], input_shape[5], input_shape[6], output_shape[0], output_shape[1], output_shape[2], output_shape[3], - output_shape[4], output_shape[5], output_shape[6], input, output); -} - -template -__global__ void StridedSliceGradKernel(const size_t b0, const size_t b1, const size_t b2, const size_t b3, - const size_t b4, const size_t b5, const size_t b6, const size_t s0, - const size_t s1, const size_t s2, const size_t s3, const size_t s4, - const size_t s5, const size_t s6, const size_t i0, const size_t i1, - const size_t i2, const size_t i3, const size_t i4, const size_t i5, - const size_t i6, const size_t o0, const size_t o1, const size_t o2, - const size_t o3, const size_t o4, const size_t o5, const size_t o6, const T *dy, - T *dx) { - size_t output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6; - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) { - size_t i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0; - size_t j = pos / (o2 * o3 * o4 * o5 * o6) % o1; - size_t k = pos / (o3 * o4 * o5 * o6) % o2; - size_t l = pos / (o4 * o5 * o6) % o3; - size_t m = pos / (o5 * o6) % o4; - size_t n = pos / (o6) % o5; - size_t o = pos % o6; - - size_t input_idx = (i * s0 + b0) * i1 * i2 * i3 * 
i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 + - (k * s2 + b2) * i3 * i4 * i5 * i6 + (l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 + - (n * s5 + b5) * i6 + (o * s6 + b6); - dx[input_idx] = dy[pos]; - } - return; -} - -template -void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, const T *dy, T *dx, - cudaStream_t cuda_stream) { - size_t size = dy_shape[0] * dy_shape[1] * dy_shape[2] * dy_shape[3] * dy_shape[4] * dy_shape[5] * dy_shape[6]; - StridedSliceGradKernel<<>>( - begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6], strides[0], strides[1], strides[2], - strides[3], strides[4], strides[5], strides[6], dx_shape[0], dx_shape[1], dx_shape[2], dx_shape[3], dx_shape[4], - dx_shape[5], dx_shape[6], dy_shape[0], dy_shape[1], dy_shape[2], dy_shape[3], dy_shape[4], dy_shape[5], dy_shape[6], - dy, dx); -} - -template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const double *input, double *output, - cudaStream_t stream); -template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const float *input, float *output, - cudaStream_t stream); -template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const half *input, half *output, - cudaStream_t stream); -template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const int *input, int *output, - cudaStream_t stream); -template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const short *input, short *output, // NOLINT - cudaStream_t stream); -template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const unsigned char *input, - unsigned char *output, cudaStream_t stream); -template void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const int64_t *input, int64_t *output, - cudaStream_t stream); -template void Slice1DKernel(const size_t s1, const size_t l1, 
const size_t d1, const bool *input, bool *output, - cudaStream_t stream); - -template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, - const size_t d2, const double *input, double *output, cudaStream_t stream); -template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, - const size_t d2, const float *input, float *output, cudaStream_t stream); -template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, - const size_t d2, const half *input, half *output, cudaStream_t stream); -template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, - const size_t d2, const int *input, int *output, cudaStream_t stream); -template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, - const size_t d2, const short *input, short *output, cudaStream_t stream); // NOLINT -template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, - const size_t d2, const unsigned char *input, unsigned char *output, cudaStream_t stream); -template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, - const size_t d2, const int64_t *input, int64_t *output, cudaStream_t stream); -template void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, - const size_t d2, const bool *input, bool *output, cudaStream_t stream); - -template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, - const size_t l3, const size_t d1, const size_t d2, const size_t d3, const double *input, - double *output, cudaStream_t stream); -template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, - const size_t l3, 
const size_t d1, const size_t d2, const size_t d3, const float *input, - float *output, cudaStream_t stream); -template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, - const size_t l3, const size_t d1, const size_t d2, const size_t d3, const half *input, - half *output, cudaStream_t stream); -template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, - const size_t l3, const size_t d1, const size_t d2, const size_t d3, const int *input, - int *output, cudaStream_t stream); -template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, - const size_t l3, const size_t d1, const size_t d2, const size_t d3, const short *input, // NOLINT - short *output, cudaStream_t stream); // NOLINT -template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, - const size_t l3, const size_t d1, const size_t d2, const size_t d3, - const unsigned char *input, unsigned char *output, cudaStream_t stream); -template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, - const size_t l3, const size_t d1, const size_t d2, const size_t d3, const int64_t *input, - int64_t *output, cudaStream_t stream); -template void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, - const size_t l3, const size_t d1, const size_t d2, const size_t d3, const bool *input, - bool *output, cudaStream_t stream); - -template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const double *input, double *output, cudaStream_t stream); -template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, 
const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const float *input, float *output, cudaStream_t stream); -template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const half *input, half *output, cudaStream_t stream); -template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const int *input, int *output, cudaStream_t stream); -template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const short *input, short *output, // NOLINT - cudaStream_t stream); -template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const unsigned char *input, unsigned char *output, - cudaStream_t stream); -template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const int64_t *input, int64_t *output, - cudaStream_t stream); -template void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const bool *input, bool *output, cudaStream_t stream); - 
-template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, - const double *input, double *output, cudaStream_t stream); -template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, - const float *input, float *output, cudaStream_t stream); -template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, - const half *input, half *output, cudaStream_t stream); -template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, - const int64_t *input, int64_t *output, cudaStream_t stream); -template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, - const int *input, int *output, cudaStream_t stream); -template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t d1, const size_t d2, const size_t d3, const 
size_t d4, const size_t d5, - const short *input, short *output, cudaStream_t stream); // NOLINT -template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, - const unsigned char *input, unsigned char *output, cudaStream_t stream); -template void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, - const bool *input, bool *output, cudaStream_t stream); - -template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, - const size_t d4, const size_t d5, const size_t d6, const double *input, double *output, - cudaStream_t stream); -template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, - const size_t d4, const size_t d5, const size_t d6, const float *input, float *output, - cudaStream_t stream); -template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, - const size_t d4, const size_t d5, const size_t d6, const half *input, half *output, - cudaStream_t 
stream); -template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, - const size_t d4, const size_t d5, const size_t d6, const int64_t *input, int64_t *output, - cudaStream_t stream); -template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, - const size_t d4, const size_t d5, const size_t d6, const int *input, int *output, - cudaStream_t stream); -template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, - const size_t d4, const size_t d5, const size_t d6, const short *input, short *output, // NOLINT - cudaStream_t stream); // NOLINT -template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, - const size_t d4, const size_t d5, const size_t d6, const unsigned char *input, - unsigned char *output, cudaStream_t stream); -template void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t l5, const size_t l6, const size_t d1, const size_t d2, const size_t d3, - const size_t d4, const size_t d5, const size_t d6, const bool 
*input, bool *output, - cudaStream_t stream); - -template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3, - const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, - const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const size_t d7, const double *input, double *output, cudaStream_t stream); -template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3, - const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, - const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const size_t d7, const float *input, float *output, cudaStream_t stream); -template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3, - const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, - const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const size_t d7, const half *input, half *output, cudaStream_t stream); -template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3, - const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, - const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const size_t d7, const int64_t *input, int64_t *output, cudaStream_t stream); -template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const 
size_t s7, const size_t l1, const size_t l2, const size_t l3, - const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, - const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const size_t d7, const int *input, int *output, cudaStream_t stream); -template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3, - const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, - const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const size_t d7, const short *input, short *output, cudaStream_t stream); // NOLINT -template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3, - const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, - const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const size_t d7, const unsigned char *input, unsigned char *output, cudaStream_t stream); -template void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, - const size_t s6, const size_t s7, const size_t l1, const size_t l2, const size_t l3, - const size_t l4, const size_t l5, const size_t l6, const size_t l7, const size_t d1, - const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const size_t d7, const bool *input, bool *output, cudaStream_t stream); - -template void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, - const double *dy, double *dx, cudaStream_t stream); -template void 
CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const float *dy, float *dx, cudaStream_t stream); -template void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const half *dy, half *dx, cudaStream_t stream); -template void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const int *dy, int *dx, cudaStream_t stream); -template void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, // NOLINT - const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const short *dy, short *dx, // NOLINT - cudaStream_t stream); -template void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, - const unsigned char *dy, unsigned char *dx, cudaStream_t stream); -template void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, - const size_t l1, const size_t l2, const size_t l3, const size_t l4, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, - const int64_t *dy, int64_t *dx, cudaStream_t stream); -template void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const 
bool *dy, bool *dx, cudaStream_t stream); - -template void FillDeviceArray(const size_t input_size, bool *addr, const float value, cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, int64_t *addr, const float value, - cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, int *addr, const float value, cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, short *addr, const float value, // NOLINT - cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, int8_t *addr, const float value, - cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, uint64_t *addr, const float value, - cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, uint32_t *addr, const float value, - cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, uint16_t *addr, const float value, - cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, unsigned char *addr, const float value, - cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, half *addr, const float value, cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, float *addr, const float value, cudaStream_t cuda_stream); -template void FillDeviceArray(const size_t input_size, double *addr, const float value, - cudaStream_t cuda_stream); - -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const bool *input, bool *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const double *input, double *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const 
std::vector &strides, const std::vector &output_shape, - const float *input, float *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const half *input, half *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const int64_t *input, int64_t *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const int *input, int *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const short *input, short *output, cudaStream_t cuda_stream); // NOLINT -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const int8_t *input, int8_t *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const uint64_t *input, uint64_t *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const uint32_t *input, uint32_t *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, - const uint16_t *input, uint16_t *output, cudaStream_t cuda_stream); -template void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, 
- const unsigned char *input, unsigned char *output, cudaStream_t cuda_stream); - -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, const bool *dy, - bool *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, - const double *dy, double *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, - const float *dy, float *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, const half *dy, - half *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, - const int64_t *dy, int64_t *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, const int *dy, - int *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, - const short *dy, // NOLINT - short *dx, cudaStream_t cuda_stream); // NOLINT -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, - const int8_t *dy, int8_t *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, - const uint64_t *dy, uint64_t *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, 
const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, - const uint32_t *dy, uint32_t *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, - const uint16_t *dy, uint16_t *dx, cudaStream_t cuda_stream); -template void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, - const unsigned char *dy, unsigned char *dx, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh deleted file mode 100644 index d1780d54f7f..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Copyright 2019-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_IMPL_CUH_ - -#include -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void SliceKernel(const T *input, T *output, const size_t output_size, cudaStream_t cuda_stream, S...pack); - -template -void CalSlice4DGrad(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const T *dy, T *dx, cudaStream_t stream); - -template -void Slice1DKernel(const size_t s1, const size_t l1, const size_t d1, const T *input, T *output, cudaStream_t stream); - -template -void Slice2DKernel(const size_t s1, const size_t s2, const size_t l1, const size_t l2, const size_t d1, const size_t d2, - const T *input, T *output, cudaStream_t stream); - -template -void Slice3DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t l1, const size_t l2, const size_t l3, - const size_t d1, const size_t d2, const size_t d3, const T *input, T *output, cudaStream_t stream); - -template -void Slice4DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t l1, const size_t l2, - const size_t l3, const size_t l4, const size_t d1, const size_t d2, const size_t d3, const size_t d4, - const T *input, T *output, cudaStream_t stream); - -template -void Slice5DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t l1, - const size_t l2, const size_t l3, const size_t l4, const size_t l5, const size_t d1, const size_t d2, - const size_t d3, const size_t d4, const size_t d5, const T *input, T *output, cudaStream_t stream); - -template -void Slice6DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6, - const size_t l1, const size_t l2, 
const size_t l3, const size_t l4, const size_t l5, const size_t l6, - const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5, const size_t d6, - const T *input, T *output, cudaStream_t stream); - -template -void Slice7DKernel(const size_t s1, const size_t s2, const size_t s3, const size_t s4, const size_t s5, const size_t s6, - const size_t s7, const size_t l1, const size_t l2, const size_t l3, const size_t l4, const size_t l5, - const size_t l6, const size_t l7, const size_t d1, const size_t d2, const size_t d3, const size_t d4, - const size_t d5, const size_t d6, const size_t d7, const T *input, T *output, cudaStream_t stream); - -template -void StridedSlice(const std::vector &input_shape, const std::vector &begin, - const std::vector &strides, const std::vector &output_shape, const T *input, - T *output, cudaStream_t cuda_stream); - -template -void StridedSliceGrad(const std::vector &dy_shape, const std::vector &begin, - const std::vector &strides, const std::vector &dx_shape, const T *dy, T *dx, - cudaStream_t cuda_stream); - -template -void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SLICE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh deleted file mode 100644 index ef6409763a0..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SMOOTH_L1_LOSS_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SMOOTH_L1_LOSS_H_ -template -void SmoothL1Loss(const int &input_size, const float &beta, const T *prediction, const T *target, T *loss, - cudaStream_t stream); -template -void SmoothL1LossGrad(const int &input_size, const float &beta, const T *prediction, const T *target, const T *dloss, - T *dx, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SMOOTH_L1_LOSS_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cu deleted file mode 100644 index 0c1c4b67b2e..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cu +++ /dev/null @@ -1,134 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include "spacetobatch_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void SpaceToBatch(const size_t size, const T *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - T *output) { - size_t temp_stride = 0; - size_t temp_pos = 0; - size_t idx_in = 0; - size_t idx_ic = 0; - size_t idx_ih = 0; - size_t idx_iw = 0; - size_t idx_on = 0; - size_t output_pos = 0; - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; - pos += blockDim.x * gridDim.x) { - temp_stride = ic * ih * iw; - idx_in = pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= ic; - idx_ic = temp_pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= ih; - idx_ih = temp_pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= iw; - idx_iw = temp_pos / temp_stride; - - idx_on = (((idx_ih + pad_up) % block_num) * block_num + ((idx_iw + pad_lft) % block_num)) * in + idx_in; - output_pos = idx_on * oc; - output_pos = (output_pos + idx_ic) * oh; - output_pos = (output_pos + ((idx_ih + pad_up) - (idx_on / (in * block_num))) / block_num) * ow; - output_pos = (output_pos + ((idx_iw + pad_lft) - ((idx_on / in) % block_num)) / block_num); - output[output_pos] = input[pos]; - } - return; -} - -template -void CalSpaceToBatch(const size_t size, const T *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - T *output, cudaStream_t cuda_stream) { - cudaMemset(output, 0, on * oc * oh * ow * sizeof(T)); - SpaceToBatch<<>>( - size, input, in, ih, iw, ic, on, oh, ow, oc, pad_up, pad_dn, 
pad_lft, pad_rht, block_num, output); - return; -} - -template void CalSpaceToBatch(const size_t size, const float *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - float *output, cudaStream_t cuda_stream); -template void CalSpaceToBatch(const size_t size, const half *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - half *output, cudaStream_t cuda_stream); -template void CalSpaceToBatch(const size_t size, const int *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - int *output, cudaStream_t cuda_stream); -template void CalSpaceToBatch(const size_t size, const int64_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - int64_t *output, cudaStream_t cuda_stream); -template void CalSpaceToBatch(const size_t size, const int16_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - int16_t *output, cudaStream_t cuda_stream); -template void CalSpaceToBatch(const size_t size, const int8_t *input, const size_t in, - const size_t 
ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - int8_t *output, cudaStream_t cuda_stream); -template void CalSpaceToBatch(const size_t size, const uint8_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - uint8_t *output, cudaStream_t cuda_stream); -template void CalSpaceToBatch(const size_t size, const uint16_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - uint16_t *output, cudaStream_t cuda_stream); -template void CalSpaceToBatch(const size_t size, const uint32_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - uint32_t *output, cudaStream_t cuda_stream); -template void CalSpaceToBatch(const size_t size, const uint64_t *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - uint64_t *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh deleted file mode 100644 index 93209f3235c..00000000000 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETOBATCH_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETOBATCH_H_ -template -void CalSpaceToBatch(const size_t size, const T *input, const size_t in, - const size_t ih, const size_t iw, const size_t ic, - const size_t on, const size_t oh, const size_t ow, - const size_t oc, const size_t pad_up, const size_t pad_dn, - const size_t pad_lft, const size_t pad_rht, const size_t block_num, - T *output, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETOBATCH_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cu deleted file mode 100644 index 16905e9bbf1..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cu +++ /dev/null @@ -1,138 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include "spacetodepth_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void SpaceToDepth(const size_t size, const T *input, const size_t in, - const size_t ic, const size_t ih, const size_t iw, - const size_t on, const size_t oc, const size_t oh, - const size_t ow, const size_t r, T *output) { - size_t temp_stride = 0; - size_t temp_pos = 0; - size_t output_pos = 0; - size_t input_pos_array[SPACETODEPTH_BUFFER_DIMENSION]; - - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; - pos += blockDim.x * gridDim.x) { - temp_stride = ic * ih * iw; - input_pos_array[0] = pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= ic; - input_pos_array[1] = temp_pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= ih; - input_pos_array[2] = temp_pos / temp_stride; - temp_pos = pos % temp_stride; - - temp_stride /= iw; - input_pos_array[3] = temp_pos / temp_stride; - - output_pos += input_pos_array[0]; - output_pos = (output_pos * oc) + - (input_pos_array[1] + - (r * (input_pos_array[2] % r) + input_pos_array[3] % r) * ic); - output_pos = (output_pos * oh) + (input_pos_array[2] / r); - output_pos = (output_pos * ow) + (input_pos_array[3] / r); - - output[output_pos] = input[pos]; - output_pos = 0; - } - return; -} - -template -void CalSpaceToDepth(const size_t size, const T *input, const size_t in, - const size_t ic, const size_t ih, const size_t iw, - const size_t on, const size_t oc, const size_t oh, - const size_t ow, const size_t r, T *output, - 
cudaStream_t cuda_stream) { - SpaceToDepth<<>>( - size, input, in, ic, ih, iw, on, oc, oh, ow, r, output); - return; -} - -template void CalSpaceToDepth(const size_t size, const float *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, float *output, - cudaStream_t cuda_stream); -template void CalSpaceToDepth(const size_t size, const half *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, half *output, - cudaStream_t cuda_stream); -template void CalSpaceToDepth(const size_t size, const int *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, int *output, - cudaStream_t cuda_stream); -template void CalSpaceToDepth(const size_t size, const int64_t *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, int64_t *output, - cudaStream_t cuda_stream); -template void CalSpaceToDepth(const size_t size, const int16_t *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, int16_t *output, - cudaStream_t cuda_stream); -template void CalSpaceToDepth(const size_t size, const int8_t *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - const size_t r, int8_t *output, - cudaStream_t cuda_stream); -template void CalSpaceToDepth(const size_t size, const uint8_t *input, - const size_t in, const size_t ic, - const size_t ih, const size_t iw, - const size_t on, const size_t oc, - const size_t oh, const size_t ow, - 
const size_t r, uint8_t *output, - cudaStream_t cuda_stream); -template void -CalSpaceToDepth(const size_t size, const uint16_t *input, - const size_t in, const size_t ic, const size_t ih, - const size_t iw, const size_t on, const size_t oc, - const size_t oh, const size_t ow, const size_t r, - uint16_t *output, cudaStream_t cuda_stream); -template void -CalSpaceToDepth(const size_t size, const uint32_t *input, - const size_t in, const size_t ic, const size_t ih, - const size_t iw, const size_t on, const size_t oc, - const size_t oh, const size_t ow, const size_t r, - uint32_t *output, cudaStream_t cuda_stream); -template void -CalSpaceToDepth(const size_t size, const uint64_t *input, - const size_t in, const size_t ic, const size_t ih, - const size_t iw, const size_t on, const size_t oc, - const size_t oh, const size_t ow, const size_t r, - uint64_t *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh deleted file mode 100644 index 85ef76460c9..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETODEPTH_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETODEPTH_H_ - -#define SPACETODEPTH_BUFFER_DIMENSION 4 -template -void CalSpaceToDepth(const size_t size, const T *input, const size_t in, - const size_t ic, const size_t ih, const size_t iw, - const size_t on, const size_t oc, const size_t oh, - const size_t ow, const size_t r, T *output, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPACETODEPTH_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh deleted file mode 100644 index fc133b00b73..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMP_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMP_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void CalSparseApplyProximalAdagrad(const size_t size, const size_t indices_size, const T *learning_rate, - const T *l1_regularization, const T *l2_regularization, const T *gradient, - const int *indices, T *variable, T *accumulation, T *variable_out, - T *accumulation_out, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMP_SPARSE_APPLY_PROXIMAL_ADAGRAD_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cuh deleted file mode 100755 index 3a32c6e36a2..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_cross_entropy_cuda_impl.cuh +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPARSECROSSENTROPYCUDAIMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPARSECROSSENTROPYCUDAIMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void CalCrossEntropy(const float *logits, T *labels, const int batch_size, const int class_num, float *loss, - cudaStream_t cuda_stream); - -template -void CalCrossEntropyGrad(const float *logits, T *labels, const int batch_size, const int class_num, float *grad, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPARSECROSSENTROPYCUDAIMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cuh deleted file mode 100644 index c5cebe6d0fd..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cuh +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SPARSE_FTRL_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SPARSE_FTRL_IMPL_H_ -template -void CalSparseApplyFtrl(const T *gradient, const S *indices, const int num_index, const size_t n_stride, - const float learning_rate, const float l1_regularization, const float l2_regularization, - const float learning_rate_power, const bool use_locking, T *variable, T *accumulation, - T *linear, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SPARSE_FTRL_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cu deleted file mode 100755 index 359a64b4cd5..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/split_impl.cu +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include "plugin/device/gpu/kernel/cuda_impl/split_impl.cuh" -template -__global__ void Split(const size_t size, const int axis_step, const int all_size_before_axis, - const int all_size_axis, const T* input, T** outputs) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { - int num = pos % all_size_before_axis / all_size_axis; - int block = num / axis_step; - int block_pos = pos / all_size_before_axis * axis_step * all_size_axis + - num % axis_step * all_size_axis + pos % all_size_axis; - outputs[block][block_pos] = input[pos]; - } - return; -} - -template -void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, - const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) { - Split<<>>(size, axis_step, all_size_before_axis, - all_size_axis, input, outputs); - return; -} - -template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, - const int all_size_axis, const half* input, half** outputs, - cudaStream_t cuda_stream); -template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, - const int all_size_axis, const float* input, float** outputs, - cudaStream_t cuda_stream); -template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, - const int all_size_axis, const double* input, double** outputs, - cudaStream_t cuda_stream); -template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, - const int all_size_axis, const int* input, int** outputs, - cudaStream_t cuda_stream); -template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, - const int all_size_axis, const uint32_t* input, uint32_t** outputs, - cudaStream_t cuda_stream); -template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, - const 
int all_size_axis, const int64_t* input, int64_t** outputs, - cudaStream_t cuda_stream); -template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis, - const int all_size_axis, const bool* input, bool** outputs, - cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cu index 3400326d011..4b5084008ba 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void AngleAtomEnergyKernel(int angle_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh index e21047e456d..f09f72de66a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_ANGLE_ANGLE_ATOM_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void AngleAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k, diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cu index 717125297de..c57befff8eb 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void AngleEnergyKernel(int angle_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh index 0a824658052..be75db9a29c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_ANGLE_ANGLE_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void AngleEnergy(int angle_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k, const float *angle_theta0, float *ene, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cu index f441c9a54d4..6457de26a8f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cu +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void AngleForceKernel(int angle_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh index b150c2e8d1e..24276a23860 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_ANGLE_ANGLE_FORCE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void AngleForce(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k, const float *angle_theta0, float *frc_f, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cu index d9d2f2da601..b186683585a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void AngleForceWithAtomEnergyKernel(int angle_numbers, const UNSIGNED_INT_VECTOR *uint_crd, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh index d4f11819e48..c8f647fc1e8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_ANGLE_ANGLE_FORCE_WITH_ATOM_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void AngleForceWithAtomEnergy(int angle_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c, const float *angle_k, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cu index 96f71952d9a..bab1f905c68 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cu @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void BondAtomEnergyCudaKernel(const int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd, 
const VECTOR *scaler, const int *atom_a, const int *atom_b, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh index edeb4e881c8..832eea77397 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_atom_energy_cuda_gpu_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_ATOM_ENERGY_GPU_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void BondAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, float *atom_ene, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cu index 34143947e23..bb1007036c6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cu @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void BondEnergyCudaKernel(const int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh index 5cd6514b4e0..fbba36e5d38 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_energy_cuda_gpu_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_ENERGY_CUDA_GPU_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void BondEnergy(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cu index c2a074f0b75..d3ae9f8f47b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cu @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void BondForceCudaKernel(int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh index 
b72a408fda6..e401f9deba4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_cuda_gpu_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_CUDA_GPU_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void BondForce(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cu index 2161f2b422d..fab62a62d90 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cu @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void BondForceWithAtomEnergyAndVirialKernel(const int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, const int *atom_a, const int *atom_b, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh index 1e1438c8434..98319adde34 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_WITH_ATOM_VIRIAL_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void BondForceWithAtomEnergyAndVirial(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cu index 353fa4da59a..11064927158 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cu @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void BondForceWithAtomEnergyKernel(int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, const int *atom_a, const int *atom_b, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh index 06fe3247697..4b11524afc9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_WITH_ATOM_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void BondForceWithAtomEnergy(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cu index 83f9a0f3c18..78dd7f56754 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cu @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void BondForceWithAtomVirialKernel(int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, const int *atom_a, const int *atom_b, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh index 33b498dc4d6..9757c081ea1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/bond/bond_force_with_atom_virial_impl.cuh @@ -18,7 +18,7 @@ #define 
MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_WITH_ATOM_VIRIAL_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void BondForceWithAtomVirial(int bond_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, const float *bond_r0, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cu index bf887ada532..8c9abb85541 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cu @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __device__ __host__ float fc(float Rij) { const float PI = 3.141592654; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh index e3ee3e875d0..0c235d4fd9d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/atomcrdtocv_impl.cuh @@ -17,7 +17,7 @@ #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_ATOMCRDTOCV_IMPL_H_ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_ATOMCRDTOCV_IMPL_H_ -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void AtomCrdToCV(int atom_numbers, int start_serial, int end_serial, int number, const float *crd_f, const 
float *old_crd, float *nowarp_crd, int *box_map_times, float *box, float *g_radial, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh index 0b3197810e2..5ab563f8e1c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f, unsigned int *uint_crd_f, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh index f26cb13e5a8..18b531fc58d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_QUARTER_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void CrdToUintCrdQuarter(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f, unsigned int *uint_crd_f, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh index becb653f48a..e3fcfb3bcce 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTEROFMASS_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void GetCenterOfMass(int residue_numbers, int *start, int *end, float *crd_f, float *atom_mass, float *residue_mass_inverse, float *center_of_mass_f, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh index 4b168216ddb..6355344e09f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTER_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void GetCenterOfGeometry(const int center_numbers, float center_numbers_inverse, const int *center_atoms, const float *crd_f, float *center_of_geometry_f, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh index fad2d93c90d..c60c9a4df4c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MAPCENTEROFMASS_IMPL_H_ #include -#include 
"plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void MapCenterOfMass(int residue_numbers, int *start, int *end, float *center_of_mass_f, float *box_length_f, float *no_wrap_crd_f, float *crd_f, float* scaler, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh index 4fd44c8567c..1a41f87f92e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MDTEMPERATURE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void MDTemperature(const int residue_numbers, const int *start, const int *end, const float *atom_vel_f, const float *atom_mass, float *ek, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh index ed256448166..1f2bdb4251e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh @@ -17,7 +17,7 @@ #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_TOTAL_C6_GET_IMPL_H_ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_TOTAL_C6_GET_IMPL_H_ -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void total_c6_get(int atom_numbers, int *atom_lj_type, float *d_lj_b, float *d_factor, cudaStream_t stream); diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh index 270427d31ce..349de212bec 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh @@ -27,7 +27,7 @@ #include #include #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #define TWO_DIVIDED_BY_SQRT_PI 1.1283791670218446 #define CONSTANT_kB 0.00198716 diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cu index ba87498ab3f..eef09d37a45 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cu @@ -17,7 +17,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void Calculate_No_Wrap_Crd(int atom_numbers, INT_VECTOR *box_map_times, VECTOR *box, VECTOR *crd, VECTOR *nowrap_crd) { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh index fce32ed57ed..27116d8aa26 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/cal_no_wrap_crd_impl.cuh @@ -19,7 +19,7 @@ #define 
MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRDMCMAP_CAL_NO_WRAP_CRD_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void calculatenowrapcrd(int atom_numbers, int *box_map_times_f, float *box_f, float *crd_f, float *nowrap_crd_f, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh index 953b0e20131..3218bd5fba5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/crdmcmap/refresh_boxmaptimes_impl.cuh @@ -19,7 +19,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRDMCMAP_REFRESH_BOXMAPTIMES_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void refresh_boxmaptimes(int atom_numbers, float *box_length_inverse, float *crd_f, float *old_crd_f, int *box_map_times_f, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu index 76d92f18bdc..4803c11f25f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void DihedralAtomEnergyKernel(int dihedral_numbers, 
const UNSIGNED_INT_VECTOR *uint_crd, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh index e421de8f79d..56d1baba7b9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_DIHEDRAL_DIHEDRAL_ATOM_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void DihedralAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu index c08b888f07f..ea58fa92c31 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void DihedralEnergyKernel(int dihedral_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh index 
fb485172d0e..eb8dc46e75c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_DIHEDRAL_DIHEDRAL_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void DihedralEnergy(int dihedral_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn, const float *pk, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cu index 089679b0752..e90780cf1ec 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void DihedralForceKernel(int dihedral_numbers, const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh index 00e08b58446..5804e3b825b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh @@ -18,7 +18,7 @@ #define 
MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_DIHEDRAL_DIHEDRAL_FORCE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void DihedralForce(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, const int *ipn, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cu index dc6936a475a..46874a0c78a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void DihedralForceWithAtomEnergyKernel(int dihedral_numbers, const UNSIGNED_INT_VECTOR *uint_crd, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh index 6841567f4c6..dc9ca6ea818 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_DIHEDRAL_DIHEDRAL_FORCE_WITH_ATOM_ENERGY_IMPL_H_ #include -#include 
"plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void DihedralForceWithAtomEnergy(int dihedral_numbers, int atom_numbers, const int *uint_crd_f, const float *scaler_f, const int *atom_a, const int *atom_b, const int *atom_c, const int *atom_d, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh index 7e85481ec20..b51a52ef1a4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh @@ -22,7 +22,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LJ_DIRECT_CF_FORCE_WITH_LJ_VIRIAL_DIRECT_CF_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy( const int atom_numbers, const float cutoff, const float pme_beta, const unsigned int *uint_crd_f, const int *LJtype, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh index 19290c90ef2..25045a5ebed 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void LJEnergy(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int 
*LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh index 904a4227ee0..c37fdcdea97 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_FORCE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void LJForce(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh index 54711ae355a..2bd59ee88e0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_FORCE_WITH_PME_DIRECT_FORCE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void LJForceWithPMEDirectForce(const int atom_numbers, const float cutoff, const float pme_beta, const int *uint_crd_f, const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh index bdda30249c0..0e22abe4352 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_PME_DIRECT_FORCE_WITH_ATOM_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void LJDirectCFForceWithAtomEnergy(const int atom_numbers, const float cutoff, const float pme_beta, const int *uint_crd_f, const int *LJtype, const float *charge, const float *scaler_f, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh index 7ccc500226f..616ce19b6d8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ATOM_ENERGY_IMPL_H #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Dihedral14CFAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh index 52a588b53cf..dfc31a357f0 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ENERGY_IMPL_H #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Dihedral14CFEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, const float *charge, float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh index 7b443c789ae..286bc9d5ac0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ATOM_ENERGY_IMPL_H #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Dihedral14LJAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh index 74153b6739c..914d6e0a010 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Dihedral14LJCFForceWithAtomEnergyAndVirial(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, const float *charge, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh index 27208029f31..ad2957b79c0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Dihedral14LJCFForceWithAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, const float *charge, float *uint_crd_with_LJ_f, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh index 04cc1a2849b..cd9c125c899 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh @@ -17,7 +17,7 @@ #define 
MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ENERGY_IMPL_H #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Dihedral14LJEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, const float *charge, float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh index e107310d5b1..ab67d0c076c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_IMPL_H #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Dihedral14LJForce(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14, const int *b_14, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh index c25dd8d06fb..0b97b56b685 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_IMPL_H #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Dihedral14LJForceWithDirectCF(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh index 8f4029f036d..a91b582c5d3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh @@ -21,7 +21,7 @@ #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_IMPL_H_ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" struct VECTOR { float x; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cu index b129d9dd952..fc2102e52c0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void MD_Iteration_Gradient_Descent(const int atom_numbers, VECTOR *crd, VECTOR *frc, diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh index f1f829014f4..0455f49219a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_gradient_descent_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_GRADIENT_DESCENT_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void MDIterationGradientDescent(const int atom_numbers, float *crd, float *frc, const float learning_rate, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh index e87a9ed85bb..af5756a7fa6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh @@ -22,7 +22,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NVTIT_MD_ITERATION_LEAP_FROG_IMPL_H #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void MDIterationLeapFrog(const int atom_numbers, float *vel, float *crd, float *frc, float *acc, const float *inverse_mass, const float dt, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cu index 7c8237112b9..a2a636726b8 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void MD_Iteration_Leap_Frog_With_LiuJian_kernel(const int atom_numbers, const float half_dt, const float dt, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh index 1595d4f845a..58bd87184c4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt, const float exp_gamma, int float4_numbers, float *inverse_mass, float *sqrt_mass_inverse, float *vel, float *crd, float *frc, float *acc, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cu index 3e029614c41..b8c53e91acf 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Velocity( diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh index 6e554f6ae8a..cf832952d7b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_with_max_vel_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_WITH_MAX_VEL_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Vel(const int atom_numbers, const float half_dt, const float dt, const float exp_gamma, int float4_numbers, float *inverse_mass, float *sqrt_mass_inverse, float *vel, float *crd, float *frc, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cu index 3eaad366d8b..60d96b1c008 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" __global__ void MD_Iteration_Leap_Frog_With_Max_Velocity(const int atom_numbers, VECTOR *vel, VECTOR *crd, VECTOR *frc, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh index 047610dcaea..4c6fa476483 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_with_max_vel_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_WITH_MAX_VEL_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void MDIterationLeapFrogWithMaxVelocity(const int atom_numbers, float *vel, float *crd, float *frc, float *acc, const float *inverse_mass, const float dt, const float max_velocity, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cu index d1254924886..54138754bd4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cu +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cu @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh index 3f8185ca556..b3d268c3013 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_SETUP_RANDOM_STATE_GPU_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed, cudaStream_t stream); #endif diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh index 77a27a1b2b3..bb2f6c87bab 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh @@ -17,8 +17,8 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_FFT_3D_IMPL_H_ #include -#include "utils/complex.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template void FFT3D(int Nfft, T *input_tensor, Complex *output_tensor, const cufftHandle &FFT_plan_r2c, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh index b184bc1c564..68d12d08cae 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh @@ -17,8 +17,8 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_IFFT_3D_IMPL_H_ #include -#include "utils/complex.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" template void IFFT3D(int Nfft, Complex *input_tensor, T *output_tensor, const cufftHandle &FFT_plan_c2r, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh index 8ee149cd9db..f5697b779ee 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz, float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, const int *uint_crd_f, diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh index d565ae57a66..49de269ca66 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_UPDATE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void PMEEnergyUpdate(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz, float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh index 12cec6454f3..8cbbf282054 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_EXCLUDED_FORCE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void PMEExcludedForce(const int atom_numbers, const float pme_beta, const int *uint_crd_f, const float *sacler_f, const float *charge, const int *excluded_list_start, const int *excluded_list, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh index 5393743e619..e21cd655f6c 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh @@ -17,7 +17,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_RECIPROCAL_FORCE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" struct _VECTOR { float x; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cu index b3c65e9bf97..546f1a96f72 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cu @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void restrain_energy_kernel(const int restrain_numbers, const int *restrain_list, const VECTOR *crd, const VECTOR *crd_ref, const float weight, const VECTOR boxlength, float *ene) { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh index f7dbfeabb1a..a58fca6cbfe 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_energy_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTRAIN_ENERGY_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void restrainenergy(int restrain_numbers, int atom_numbers, float weight, const int *restrain_list, const float *crd_f, const float *crd_ref, const float *boxlength_f, float *ene, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh index 3e175457b5a..94f3398e8d8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_atom_energy_virial_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTRAIN_FORCE_ATOM_ENERGY_VIRIAL_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void restrainforcewithatomenergyandvirial(int restrain_numbers, int atom_numbers, const int *restrain_list, const float *crd_f, const float *crd_ref_f, const float weight, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cu index 779484179d4..cff6a302b80 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cu @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common_sponge.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh" __global__ void restrainforcekernel(int restrain_numbers, const int *restrain_list, 
const UNSIGNED_INT_VECTOR *uint_crd, const UNSIGNED_INT_VECTOR *uint_crd_ref, const float factor, const VECTOR *scaler, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh index bfc67e8da69..1788b9c55c3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/restrain/restrain_force_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_RESTRAIN_RESTAIN_FORCE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void restrainforce(int restrain_numbers, int atom_numbers, const int *restrain_list, const int *uint_crd_f, const int *uint_crd_ref, const float factor, const float *scaler_f, float *frc_f, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh index 41ffceaff3a..6af297ba7a0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh @@ -22,7 +22,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Constrain_Force_Cycle(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f, const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f, diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh index fc96cde5cde..3c77e8d0355 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh @@ -22,7 +22,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_WITH_VIRIAL_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void Constrain_Force_Cycle_With_Virial(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f, const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh index e365d0ce765..05b351c053b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh @@ -22,7 +22,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_VIRIAL_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void constrain_force_cycle_update(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f, const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f, diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh index 6152cab2026..2d71397680a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/last_crd_to_dr_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_LAST_CRD_TO_DR_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void lastcrdtodr(int constrain_pair_numbers, const float *atom_crd_f, const float *quarter_crd_to_uint_crd_cof_f, const float *uint_dr_to_dr_f, float *constrain_pair_f, const int *atom_i_serials, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh index f5de7eb46b7..a3c3d40f452 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_crd_vel_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_REFRESH_CRD_VEL_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void refreshcrdvel(int atom_numbers, float dt_inverse, float dt, float exp_gamma, float half_exp_gamma_plus_half, float *test_frc_f, float *mass_inverse, float *crd_f, float *vel_f, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh 
b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh index b74aa1edcea..ecdb5c7ec72 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sponge/simple_constrain/refresh_uint_crd_impl.cuh @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_REFRESH_UINT_CRD_IMPL_H_ #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" void refreshuintcrd(int atom_numbers, float half_exp_gamma_plus_half, const float *crd_f, const float *quarter_crd_to_uint_crd_cof_f, const float *test_frc_f, const float *mass_inverse, diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cu deleted file mode 100644 index e9ae27d86d5..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cu +++ /dev/null @@ -1,199 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh" - -const int kWarpSize = 32; -const int kNumWarps = 32; - -__inline__ __device__ float HalfFloatInputConvert(const half val) { return __half2float(val); } -__inline__ __device__ float HalfFloatInputConvert(const float val) { return val; } -__inline__ __device__ void HalfFloatOutputAssign(const float val, float *arr, int idx) { arr[idx] = val; } -__inline__ __device__ void HalfFloatOutputAssign(const float val, half *arr, int idx) { arr[idx] = __float2half(val); } - -template -__global__ void SyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, - G *saved_mean, G *saved_invstd, float *dy_sum_local, float *dot_p_local) { - // block level memory - __shared__ float shared_dy[kNumWarps]; - __shared__ float shared_dot_p[kNumWarps]; - int warpId = threadIdx.x / kWarpSize; // threads are arranged in warps of 32 executed together - int laneId = threadIdx.x % kWarpSize; - - int plane = blockIdx.x; // this thread will only function on a single plane - int plane_size = N * H * W; - float mean = static_cast(saved_mean[plane]); - - if (threadIdx.x < kNumWarps) { - shared_dy[threadIdx.x] = static_cast(0); - shared_dot_p[threadIdx.x] = static_cast(0); - } - - __syncthreads(); // ensure all 0 init complete across all values - - float dy_sum = static_cast(0); - float dot_p = static_cast(0); - - // individual thread level reduction - for (int x = threadIdx.x; x < plane_size; x += blockDim.x) { - int index = (x / (H * W) * C * H * W) + (plane * H * W) + (x % (H * W)); - float input_value = HalfFloatInputConvert(x_input[index]); - float dy_value = HalfFloatInputConvert(dy[index]); - dy_sum += dy_value; - dot_p += (input_value - mean) * dy_value; - } - __syncthreads(); - // warp reduce all values in every value to a single value - for (int offset 
= kWarpSize / 2; offset > 0; offset /= 2) { - float other_dy_sum = __shfl_down_sync(0xffffffff, dy_sum, offset); - float other_dot_p = __shfl_down_sync(0xffffffff, dot_p, offset); - dy_sum += other_dy_sum; - dot_p += other_dot_p; - } - __syncwarp(); - if (laneId == 0) { - shared_dy[warpId] = dy_sum; - shared_dot_p[warpId] = dot_p; - // one value per warp now - } - __syncthreads(); - if (warpId == 0) { - dy_sum = shared_dy[laneId]; - dot_p = shared_dot_p[laneId]; - __syncwarp(); - for (int offset = kWarpSize / 2; offset > 0; offset /= 2) { - float other_dy = __shfl_down_sync(0xffffffff, dy_sum, offset); - float other_dot_p = __shfl_down_sync(0xffffffff, dot_p, offset); - dy_sum += other_dy; - dot_p += other_dot_p; - } - __syncwarp(); - } - if (threadIdx.x == 0) { - dy_sum_local[plane] = dy_sum; - dot_p_local[plane] = dot_p; - } - return; -} - -template -__global__ void SyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx, - G *saved_mean, G *saved_invstd, float *dy_sum_red, float *dot_p_red, S *scale, - S *dscale, S *dbias, float epsilon) { - int size = N * C * H * W; - int plane_size = N * H * W; - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { - int block_num = (pos / W) / H; // which of N * C blocks - int plane = block_num % C; - float mean = HalfFloatInputConvert(saved_mean[plane]); - float invstd = HalfFloatInputConvert(saved_invstd[plane]); - float scale_value = HalfFloatInputConvert(scale[plane]); - float div_factor = HalfFloatInputConvert(1) / plane_size; - float dy_sum_plane = dy_sum_red[plane]; - float dot_p_plane = dot_p_red[plane]; - float grad_mean = dy_sum_plane * div_factor; - float proj_scale = dot_p_plane * div_factor * invstd * invstd; - float grad_scale = invstd * scale_value; - float inp = HalfFloatInputConvert(x_input[pos]); - float proj = (inp - mean) * proj_scale; - HalfFloatOutputAssign((HalfFloatInputConvert(dy[pos]) - proj - grad_mean) * 
grad_scale, dx, pos); - } -} - -template -__global__ void SyncBatchNormGradPostScaleBias(size_t C, G *saved_invstd, float *dy_sum_red, float *dot_p_red, - S *dscale, S *dbias) { - for (size_t plane = blockIdx.x * blockDim.x + threadIdx.x; plane < C; plane += blockDim.x * gridDim.x) { - float invstd = HalfFloatInputConvert(saved_invstd[plane]); - float dy_sum_plane = dy_sum_red[plane]; - float dot_p_plane = dot_p_red[plane]; - dscale[plane] = static_cast(dot_p_plane * invstd); - dbias[plane] = static_cast(dy_sum_plane); - } -} - -template -void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, G *saved_mean, - G *saved_invstd, float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream) { - SyncBatchNormGradPre<<>>(N, C, H, W, x_input, dy, saved_mean, saved_invstd, - dy_sum_local, dot_p_local); - return; -} -template -void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx, - G *saved_mean, G *saved_invstd, float *dy_sum_red, float *dot_p_red, S *scale, S *dscale, - S *dbias, float epsilon, cudaStream_t cuda_stream) { - SyncBatchNormGradPost<<>>(N, C, H, W, x_input, dy, dx, saved_mean, saved_invstd, - dy_sum_red, dot_p_red, scale, dscale, dbias, epsilon); - SyncBatchNormGradPostScaleBias<<(GET_THREADS)), 0, cuda_stream>>>( - C, saved_invstd, dy_sum_red, dot_p_red, dscale, dbias); -} -// PRE FUNCTION -template void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const float *x_input, - const float *dy, float *saved_mean, float *saved_invstd, - float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const float *x_input, - const float *dy, half *saved_mean, half *saved_invstd, - float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const half *x_input, - const half *dy, 
float *saved_mean, float *saved_invstd, - float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const half *x_input, - const half *dy, half *saved_mean, half *saved_invstd, - float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream); -// POST FUNCTION -template void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, - const float *x_input, const float *dy, float *dx, - float *saved_mean, float *saved_invstd, float *dy_sum_red, - float *dot_p_red, float *scale, float *dscale, float *dbias, - float epsilon, cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const half *x_input, - const half *dy, half *dx, float *saved_mean, - float *saved_invstd, float *dy_sum_red, float *dot_p_red, - float *scale, float *dscale, float *dbias, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const float *x_input, - const float *dy, float *dx, float *saved_mean, - float *saved_invstd, float *dy_sum_red, float *dot_p_red, - half *scale, half *dscale, half *dbias, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const half *x_input, - const half *dy, half *dx, float *saved_mean, - float *saved_invstd, float *dy_sum_red, float *dot_p_red, - half *scale, half *dscale, half *dbias, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const float *x_input, - const float *dy, float *dx, half *saved_mean, - half *saved_invstd, float *dy_sum_red, float *dot_p_red, - float *scale, float *dscale, float *dbias, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const half *x_input, - const half *dy, half *dx, half *saved_mean, - half 
*saved_invstd, float *dy_sum_red, float *dot_p_red, - float *scale, float *dscale, float *dbias, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const float *x_input, - const float *dy, float *dx, half *saved_mean, - half *saved_invstd, float *dy_sum_red, float *dot_p_red, - half *scale, half *dscale, half *dbias, float epsilon, - cudaStream_t cuda_stream); -template void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const half *x_input, - const half *dy, half *dx, half *saved_mean, half *saved_invstd, - float *dy_sum_red, float *dot_p_red, half *scale, half *dscale, - half *dbias, float epsilon, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh deleted file mode 100644 index 9378cde8580..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -// /** -// * Copyright 2021 Huawei Technologies Co., Ltd -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. -// * You may obtain a copy of the License at -// * -// * http://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. 
-// */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_GRAD_IMPL_CUH -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_GRAD_IMPL_CUH -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void CalSyncBatchNormGradPre(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, G *saved_mean, - G *invstd_saved, float *dy_sum_local, float *dot_p_local, cudaStream_t cuda_stream); -template -void CalSyncBatchNormGradPost(size_t N, size_t C, size_t H, size_t W, const T *x_input, const T *dy, T *dx, - G *saved_mean, G *invstd_saved, float *dy_sum_red, float *dot_p_red, S *scale, S *dscale, - S *dbias, float epsilon, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_GRAD_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh deleted file mode 100644 index 4a12e2b3b92..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh +++ /dev/null @@ -1,33 +0,0 @@ -// /** -// * Copyright 2021 Huawei Technologies Co., Ltd -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. -// * You may obtain a copy of the License at -// * -// * http://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. 
-// */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_IMPL_CUH -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_IMPL_CUH -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void CalSyncBatchNormPre(size_t N, size_t C, size_t H, size_t W, const T *input, int *output_n, float *means_local, - float *invstds_local, float epsilon, cudaStream_t cuda_stream); -template -void CalSyncBatchNormGather(size_t N, size_t C, size_t H, size_t W, int *counts_global, float *means_global, - float *invstds_global, int *counts_local, float *means_local, float *invstds_local, - T *running_mean_output, T *running_var_output, G *running_mean_input, G *running_var_input, - float epsilon, float momentum, size_t group_rank, size_t group_size, - cudaStream_t cuda_stream); -template -void CalSyncBatchNormPost(size_t N, size_t C, size_t H, size_t W, const T *input, T *output, float *means_local, - float *invstds_local, S *scale, S *bias, S *output_scale, S *output_bias, float epsilon, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SYNC_BATCH_NORM_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cu deleted file mode 100644 index 7e8154a0510..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cu +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void TensorScatterAddKernel(T *input, S *indices, T *update, T *output, const size_t block_size, - const size_t input_size, const size_t output_size, const size_t indices_dim_0, - const size_t indices_dim_1, S *indices_stride, S *work_shape) { - int i, j; - for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; - read_index += blockDim.x * gridDim.x) { - size_t write_index = 0; - bool out_bound = false; - - i = read_index / block_size; - j = read_index % block_size; - - for (size_t k = 0; k < indices_dim_1; k++) { - S indices_i = indices[i * indices_dim_1 + k]; - out_bound |= indices_i >= work_shape[k]; - write_index += indices_i * indices_stride[k]; - } - - write_index += j; - out_bound |= write_index >= output_size; - - if (!out_bound) { - MsAtomicAdd(&output[write_index], update[read_index]); - } - } -} - -template -void TensorScatterAdd(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream) { - TensorScatterAddKernel<<>>( - input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride, - work_shape); - return; -} - -template void TensorScatterAdd(half *input, int 
*indices, half *update, half *output, - const size_t &block_size, const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, - int *work_shape, cudaStream_t stream); -template void TensorScatterAdd(float *input, int *indices, float *update, float *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void TensorScatterAdd(double *input, int *indices, double *update, double *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void TensorScatterAdd(char *input, int *indices, char *update, char *output, - const size_t &block_size, const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, - int *work_shape, cudaStream_t stream); -template void TensorScatterAdd(unsigned char *input, int *indices, unsigned char *update, - unsigned char *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int *indices_stride, int *work_shape, cudaStream_t stream); -template void TensorScatterAdd(int *input, int *indices, int *update, int *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, - int *work_shape, cudaStream_t stream); -template void TensorScatterAdd(double *input, int64_t *indices, double *update, double *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, - 
int64_t *work_shape, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh deleted file mode 100644 index f92f4e2ad99..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_ADD_IMPL_CUH -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_ADD_IMPL_CUH - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void TensorScatterAdd(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_ADD_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cu deleted file mode 100644 index 0f47a19dd48..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cu +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * 
Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void TensorScatterMaxKernel(T *input, S *indices, T *update, T *output, const size_t block_size, - const size_t input_size, const size_t output_size, const size_t indices_dim_0, - const size_t indices_dim_1, S *indices_stride, S *work_shape) { - int i, j; - for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; - read_index += blockDim.x * gridDim.x) { - size_t write_index = 0; - bool out_bound = false; - - i = read_index / block_size; - j = read_index % block_size; - - for (size_t k = 0; k < indices_dim_1; k++) { - S indices_i = indices[i * indices_dim_1 + k]; - out_bound |= indices_i >= work_shape[k]; - write_index += indices_i * indices_stride[k]; - } - - write_index += j; - out_bound |= write_index >= output_size; - - if (!out_bound) { - MsAtomicMax(&output[write_index], update[read_index]); - } - } -} - -template -void TensorScatterMax(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream) { - TensorScatterMaxKernel<<>>( - input, indices, update, output, block_size, input_size, output_size, 
indices_dim_0, indices_dim_1, indices_stride, - work_shape); - return; -} - -// for int32 index -template void TensorScatterMax(half *input, int *indices, half *update, half *output, - const size_t &block_size, const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, - int *work_shape, cudaStream_t stream); - -template void TensorScatterMax(float *input, int *indices, float *update, float *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); - -template void TensorScatterMax(char *input, int *indices, char *update, char *output, - const size_t &block_size, const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, - int *work_shape, cudaStream_t stream); - -template void TensorScatterMax(unsigned char *input, int *indices, unsigned char *update, - unsigned char *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int *indices_stride, int *work_shape, cudaStream_t stream); - -template void TensorScatterMax(int *input, int *indices, int *update, int *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, - int *work_shape, cudaStream_t stream); - -// for int64 index -template void TensorScatterMax(half *input, int64_t *indices, half *update, half *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); - -template void TensorScatterMax(float *input, int64_t *indices, float *update, float 
*output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, - int64_t *work_shape, cudaStream_t stream); - -template void TensorScatterMax(char *input, int64_t *indices, char *update, char *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); - -template void TensorScatterMax(unsigned char *input, int64_t *indices, unsigned char *update, - unsigned char *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); - -template void TensorScatterMax(int *input, int64_t *indices, int *update, int *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh deleted file mode 100644 index b09b4165a1c..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MAX_IMPL_CUH -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MAX_IMPL_CUH - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void TensorScatterMax(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MAX_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cu deleted file mode 100644 index f11791c9706..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cu +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void TensorScatterMinKernel(T *input, S *indices, T *update, T *output, const size_t block_size, - const size_t input_size, const size_t output_size, const size_t indices_dim_0, - const size_t indices_dim_1, S *indices_stride, S *work_shape) { - int i, j; - for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; - read_index += blockDim.x * gridDim.x) { - size_t write_index = 0; - bool out_bound = false; - - i = read_index / block_size; - j = read_index % block_size; - - for (size_t k = 0; k < indices_dim_1; k++) { - S indices_i = indices[i * indices_dim_1 + k]; - out_bound |= indices_i >= work_shape[k]; - write_index += indices_i * indices_stride[k]; - } - - write_index += j; - out_bound |= write_index >= output_size; - - if (!out_bound) { - MsAtomicMin(&output[write_index], update[read_index]); - } - } -} - -template -void TensorScatterMin(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream) { - TensorScatterMinKernel<<>>( - input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride, - work_shape); - return; -} - -// for int32 index -template void TensorScatterMin(half *input, int *indices, half *update, half *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); - -template void TensorScatterMin(float *input, int *indices, float *update, float *output, - const size_t &block_size, const size_t &input_size, - const size_t 
&output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); - -template void TensorScatterMin(char *input, int *indices, char *update, char *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); - -template void TensorScatterMin(unsigned char *input, int *indices, unsigned char *update, - unsigned char *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int *indices_stride, int *work_shape, cudaStream_t stream); - -template void TensorScatterMin(int *input, int *indices, int *update, int *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); - -// for int64 index -template void TensorScatterMin(half *input, int64_t *indices, half *update, half *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); - -template void TensorScatterMin(float *input, int64_t *indices, float *update, float *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, - int64_t *work_shape, cudaStream_t stream); - -template void TensorScatterMin(char *input, int64_t *indices, char *update, char *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); - -template void 
TensorScatterMin(unsigned char *input, int64_t *indices, unsigned char *update, - unsigned char *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); - -template void TensorScatterMin(int *input, int64_t *indices, int *update, int *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh deleted file mode 100644 index c1453c9a8ba..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MIN_IMPL_CUH -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MIN_IMPL_CUH - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void TensorScatterMin(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_MIN_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cu deleted file mode 100644 index 5230b844b5f..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cu +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void TensorScatterSubKernel(T *input, S *indices, T *update, T *output, const size_t block_size, - const size_t input_size, const size_t output_size, const size_t indices_dim_0, - const size_t indices_dim_1, S *indices_stride, S *work_shape) { - int i, j; - for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; - read_index += blockDim.x * gridDim.x) { - size_t write_index = 0; - bool out_bound = false; - - i = read_index / block_size; - j = read_index % block_size; - - for (size_t k = 0; k < indices_dim_1; k++) { - S indices_i = indices[i * indices_dim_1 + k]; - out_bound |= indices_i >= work_shape[k]; - write_index += indices_i * indices_stride[k]; - } - - write_index += j; - out_bound |= write_index >= output_size; - - if (!out_bound) { - MsAtomicSub(&output[write_index], update[read_index]); - } - } -} - -template -void TensorScatterSub(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream) { - TensorScatterSubKernel<<>>( - input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride, - work_shape); - return; -} - -// for int32 index -template void TensorScatterSub(half *input, int *indices, half *update, half *output, - const size_t &block_size, const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, - int *work_shape, cudaStream_t stream); - -template void TensorScatterSub(float *input, int *indices, float *update, float *output, - const size_t &block_size, const size_t &input_size, - const size_t 
&output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); - -template void TensorScatterSub(char *input, int *indices, char *update, char *output, - const size_t &block_size, const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, - int *work_shape, cudaStream_t stream); - -template void TensorScatterSub(unsigned char *input, int *indices, unsigned char *update, - unsigned char *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int *indices_stride, int *work_shape, cudaStream_t stream); - -template void TensorScatterSub(int *input, int *indices, int *update, int *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, int *indices_stride, - int *work_shape, cudaStream_t stream); - -// for int64 index -template void TensorScatterSub(half *input, int64_t *indices, half *update, half *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); - -template void TensorScatterSub(float *input, int64_t *indices, float *update, float *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, - int64_t *work_shape, cudaStream_t stream); - -template void TensorScatterSub(char *input, int64_t *indices, char *update, char *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); - -template void 
TensorScatterSub(unsigned char *input, int64_t *indices, unsigned char *update, - unsigned char *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); - -template void TensorScatterSub(int *input, int64_t *indices, int *update, int *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, int64_t *work_shape, - cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh deleted file mode 100644 index 6b691c4b195..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_SUB_IMPL_CUH -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_SUB_IMPL_CUH - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void TensorScatterSub(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_SUB_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cu deleted file mode 100644 index db9ba141211..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cu +++ /dev/null @@ -1,108 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -__global__ void TensorScatterUpdateKernel(T *input, S *indices, T *update, T *output, const size_t block_size, - const size_t input_size, const size_t output_size, const size_t indices_dim_0, - const size_t indices_dim_1, S *indices_stride, S *work_shape) { - int i, j; - for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size; - read_index += blockDim.x * gridDim.x) { - size_t write_index = 0; - bool out_bound = false; - - i = read_index / block_size; - j = read_index % block_size; - - for (size_t k = 0; k < indices_dim_1; k++) { - S indices_i = indices[i * indices_dim_1 + k]; - out_bound |= indices_i >= work_shape[k]; - write_index += indices_i * indices_stride[k]; - } - - write_index += j; - out_bound |= write_index >= output_size; - - if (!out_bound) { - output[write_index] = update[read_index]; - } - } -} - -template -void TensorScatterUpdate(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream) { - TensorScatterUpdateKernel<<>>( - input, indices, update, output, block_size, input_size, output_size, indices_dim_0, indices_dim_1, indices_stride, - work_shape); - return; -} - -template void TensorScatterUpdate(half *input, int *indices, half *update, half *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void TensorScatterUpdate(float *input, int *indices, float *update, float *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const 
size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void TensorScatterUpdate(double *input, int *indices, double *update, double *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void TensorScatterUpdate(char *input, int *indices, char *update, char *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void TensorScatterUpdate(unsigned char *input, int *indices, unsigned char *update, - unsigned char *output, const size_t &block_size, - const size_t &input_size, const size_t &output_size, - const size_t &indices_dim_0, const size_t &indices_dim_1, - int *indices_stride, int *work_shape, cudaStream_t stream); -template void TensorScatterUpdate(int *input, int *indices, int *update, int *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void TensorScatterUpdate(bool *input, int *indices, bool *update, bool *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int *indices_stride, int *work_shape, - cudaStream_t stream); -template void TensorScatterUpdate(bool *input, int64_t *indices, bool *update, bool *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, - int64_t *work_shape, cudaStream_t stream); -template void TensorScatterUpdate(float *input, int64_t *indices, 
float *update, float *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, - int64_t *work_shape, cudaStream_t stream); -template void TensorScatterUpdate(double *input, int64_t *indices, double *update, double *output, - const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, - const size_t &indices_dim_1, int64_t *indices_stride, - int64_t *work_shape, cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh deleted file mode 100644 index c5e59b9fee9..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_UPDATE_IMPL_CUH -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_UPDATE_IMPL_CUH - -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void TensorScatterUpdate(T *input, S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size, - const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, - S *indices_stride, S *work_shape, cudaStream_t stream); -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TENSOR_SCATTER_UPDATE_IMPL_CUH diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cu deleted file mode 100644 index 233259155c9..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cu +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh" - -template -__global__ void Tile(const size_t output_size, const size_t input_size, const size_t shape_size, - const size_t *input_shape, const size_t *output_shape, const T *input, T *output) { - // for example 4-D: pos = pos_array[0] * output_shape[1] * output_shape[2] * output_shape[3] + - // pos_array[1] * output_shape[2] * output_shape[3] + - // pos_array[2] * output_shape[3] + - // pos_array[3] - size_t pos_array[TILE_MAX_DIMENSION]; - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_size; pos += blockDim.x * gridDim.x) { - size_t tmp_pos = pos; - size_t pos_size = output_size / output_shape[0]; - pos_array[0] = tmp_pos / pos_size; - for (size_t i = 1; i < shape_size; i++) { - tmp_pos -= pos_array[i - 1] * pos_size; - pos_size = pos_size / output_shape[i]; - pos_array[i] = tmp_pos / pos_size; - } - for (size_t i = 0; i < shape_size; i++) { - pos_array[i] = pos_array[i] % input_shape[i]; - } - pos_size = input_size; - size_t input_pos = 0; - for (size_t i = 0; i < shape_size; i++) { - pos_size /= input_shape[i]; - input_pos += (pos_array[i] * pos_size); - } - output[pos] = input[input_pos]; - } -} - -template -void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, const size_t *input_shape, - const size_t *output_shape, const T *input, T *output, cudaStream_t cuda_stream) { - Tile<<>>(output_size, input_size, shape_size, input_shape, - output_shape, input, output); - return; -} - -template void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, - const size_t *input_shape, const size_t *output_shape, const double *input, - double *output, cudaStream_t cuda_stream); -template void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, - const size_t *input_shape, const size_t *output_shape, const float *input, float *output, - cudaStream_t cuda_stream); -template void 
CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, - const size_t *input_shape, const size_t *output_shape, const half *input, half *output, - cudaStream_t cuda_stream); -template void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, - const size_t *input_shape, const size_t *output_shape, const int16_t *input, - int16_t *output, cudaStream_t cuda_stream); -template void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, - const size_t *input_shape, const size_t *output_shape, const int *input, int *output, - cudaStream_t cuda_stream); -template void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, - const size_t *input_shape, const size_t *output_shape, const int64_t *input, - int64_t *output, cudaStream_t cuda_stream); -template void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, - const size_t *input_shape, const size_t *output_shape, const bool *input, - bool *output, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh deleted file mode 100644 index 97e43f5dcd5..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TILE_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TILE_IMPL_CUH_ - -#define TILE_MAX_DIMENSION 100 -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void CalTile(const size_t output_size, const size_t input_size, const size_t shape_size, const size_t *input_shape, - const size_t *output_shape, const T *input, T *output, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TILE_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cu deleted file mode 100755 index 9dbe1656720..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl.cu +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include "transpose_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "utils/complex.h" - -template -using Complex = mindspore::utils::Complex; - -template -__global__ void Transpose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis, - const size_t shape_size, T *output) { - size_t pos_size; - size_t temp_pos; - size_t newpos; - size_t newpos_size; - size_t pos_array[TRANSPOSE_MAX_DIMENSION]; - - // for example 4-D: pos = posArray[0] * input_shape[1] * input_shape[2] * input_shape[3] + - // posArray[1] * input_shape[2] * input_shape[3] + - // posArray[2] * input_shape[3] + - // posArray[3] - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { - temp_pos = pos; - pos_size = size / input_shape[0]; - pos_array[0] = temp_pos / pos_size; - for (size_t i = 1; i < shape_size; i++) { - temp_pos -= pos_array[i - 1] * pos_size; - pos_size = pos_size / input_shape[i]; - pos_array[i] = temp_pos / pos_size; - } - - newpos = pos_array[input_axis[shape_size - 1]]; - newpos_size = 1; - for (int64_t j = shape_size - 2; j >= 0; j--) { - newpos_size *= input_shape[input_axis[j + 1]]; - newpos += pos_array[input_axis[j]] * newpos_size; - } - - output[newpos] = input[pos]; - } -} -template -void CalTranspose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis, - const size_t shape_size, T *output, cudaStream_t cuda_stream) { - Transpose<<>>(size, input, input_shape, input_axis, shape_size, - output); -} - -template void CalTranspose(const size_t size, const double *input, const size_t *input_shape, - const size_t *input_axis, const size_t shape_size, double *output, - cudaStream_t cuda_stream); -template void CalTranspose(const size_t size, const float *input, const size_t *input_shape, - const size_t *input_axis, const size_t shape_size, float *output, - cudaStream_t cuda_stream); -template void CalTranspose(const size_t 
size, const half *input, const size_t *input_shape, - const size_t *input_axis, const size_t shape_size, half *output, - cudaStream_t cuda_stream); -template void CalTranspose(const size_t size, const int *input, const size_t *input_shape, - const size_t *input_axis, const size_t shape_size, int *output, - cudaStream_t cuda_stream); -template void CalTranspose(const size_t size, const int64_t *input, const size_t *input_shape, - const size_t *input_axis, const size_t shape_size, int64_t *output, - cudaStream_t cuda_stream); -template void CalTranspose>(const size_t size, const Complex *input, const size_t *input_shape, - const size_t *input_axis, const size_t shape_size, Complex *output, - cudaStream_t cuda_stream); -template void CalTranspose>(const size_t size, const Complex *input, const size_t *input_shape, - const size_t *input_axis, const size_t shape_size, Complex *output, - cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh deleted file mode 100644 index e7c6306d299..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh +++ /dev/null @@ -1,33 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_OPT_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_OPT_H_ - -#include - -#define TRANSPOSE_MAX_DIMENSION 100 -template -void CalNHWC2NCHWInterface(const size_t size, const size_t shape_size, const T *d_input, const size_t *input_shape, - const size_t *input_axis, const size_t *d_input_shape, const size_t *d_input_axis, T *output, - cudaStream_t cuda_stream); - -template -void CalNCHW2NHWCInterface(const size_t size, const size_t shape_size, const T *d_input, const size_t *input_shape, - const size_t *input_axis, const size_t *d_input_shape, const size_t *d_input_axis, T *output, - cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRANSPOSE_OPT_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh deleted file mode 100644 index bdf14788930..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRIANGLEMATRIXCOPYIMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRIANGLEMATRIXCOPYIMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void TriangleMatrixCopy(const T *input, T *output, bool clean, cublasFillMode_t uplo, const size_t count, - const size_t ldb, const size_t m, cudaStream_t cuda_stream); - -template -void MatrixCopy(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TRIANGLEMATRIXCOPYIMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cuh deleted file mode 100755 index 36c5c7fbe43..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cuh +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOP_GRAD_IMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOP_GRAD_IMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -template -void SqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); -template -void RsqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); -template -void AsinGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); -template -void ACosGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); -template -void AtanGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); -template -void AsinhGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); -template -void AcoshGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); -template -void ReciprocalGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOP_GRAD_IMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cu deleted file mode 100755 index 63a942c6123..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cu +++ /dev/null @@ -1,820 +0,0 @@ -/** - * Copyright 2019-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "unary_op_impl.cuh" -template -__global__ void ExponentialKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = expf(input[i]); - } - return; -} -template <> -__global__ void ExponentialKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = exp(input[i]); - } - return; -} -template <> -__global__ void ExponentialKernel(const half *input, half *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = hexp(input[i]); - } - return; -} -template -__global__ void Expm1Kernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = expm1f(input[i]); - } - return; -} -template <> -__global__ void Expm1Kernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = expm1(input[i]); - } - return; -} -template -__global__ void LogarithmKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = logf(input[i]); - } - return; -} -template <> -__global__ void LogarithmKernel(const double 
*input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = log(input[i]); - } - return; -} -template <> -__global__ void LogarithmKernel(const half *input, half *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = hlog(input[i]); - } - return; -} -template -__global__ void Log1pKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = log1pf(input[i]); - } - return; -} -template <> -__global__ void Log1pKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = log1p(input[i]); - } - return; -} -template -__global__ void ErfKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = erff(input[i]); - } - return; -} -template <> -__global__ void ErfKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = erf(input[i]); - } - return; -} -template -__global__ void ErfcKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = erfcf(input[i]); - } - return; -} -template <> -__global__ void ErfcKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = erfc(input[i]); - } - return; -} -template -__global__ void NegativeKernel(const T *input, T *output, const size_t count) { - T neg_one = 
-1; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = neg_one * input[i]; - } - return; -} -template -__global__ void ReciprocalKernel(const T *input, T *output, const size_t count) { - T one = 1.0; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = one / input[i]; - } - return; -} -template -__global__ void SquareKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = input[i] * input[i]; - } - return; -} -template -__global__ void SqrtKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = sqrtf(input[i]); - } - return; -} -template <> -__global__ void SqrtKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = sqrt(input[i]); - } - return; -} -template <> -__global__ void SqrtKernel(const half *input, half *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = hsqrt(input[i]); - } - return; -} -template -__global__ void RsqrtKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = rsqrtf(input[i]); - } - return; -} -template <> -__global__ void RsqrtKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = rsqrt(input[i]); - } - return; -} -template <> -__global__ void RsqrtKernel(const half *input, half *output, const size_t count) { - for (size_t i = blockIdx.x * 
blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = hrsqrt(input[i]); - } - return; -} -template -__global__ void SinKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = sinf(input[i]); - } - return; -} -template <> -__global__ void SinKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = sin(input[i]); - } - return; -} -template <> -__global__ void SinKernel(const half *input, half *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = hsin(input[i]); - } - return; -} -template -__global__ void AsinKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = asinf(input[i]); - } - return; -} -template <> -__global__ void AsinKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = asin(input[i]); - } - return; -} -template -__global__ void AsinhKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = asinhf(input[i]); - } - return; -} -template <> -__global__ void AsinhKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = asinh(input[i]); - } - return; -} -template -__global__ void CosKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) 
{ - output[i] = cosf(input[i]); - } - return; -} -template <> -__global__ void CosKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = cos(input[i]); - } - return; -} -template <> -__global__ void CosKernel(const half *input, half *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = hcos(input[i]); - } - return; -} -template -__global__ void ACosKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = acosf(input[i]); - } - return; -} -template <> -__global__ void ACosKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = acos(input[i]); - } - return; -} -template -__global__ void AcoshKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = acoshf(input[i]); - } - return; -} -template <> -__global__ void AcoshKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = acosh(input[i]); - } - return; -} -template -__global__ void AtanKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = atanf(input[i]); - } - return; -} -template <> -__global__ void AtanKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = atan(input[i]); - } - return; -} 
-template -__global__ void AbsKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = abs(input[i]); - } - return; -} -template <> -__global__ void AbsKernel(const half *input, half *output, const size_t count) { - half zero = 0.0; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = input[i] < zero ? -input[i] : input[i]; - } - return; -} -template -__global__ void AbsKernel(const Complex *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = abs(input[i]); - } - return; -} -template -__global__ void RealKernel(const Complex *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = input[i].real(); - } - return; -} -template -__global__ void RealKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = input[i]; - } - return; -} -template -__global__ void ImagKernel(const Complex *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = input[i].imag(); - } - return; -} -template -__global__ void ImagKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - T zero = 0; - output[i] = zero; - } - return; -} -template -__global__ void ConjKernel(const Complex *input, Complex *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = Complex(input[i].real(), -input[i].imag()); - } - return; -} -template 
-__global__ void ConjKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = input[i]; - } - return; -} -template -__global__ void FloorKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = floorf(input[i]); - } - return; -} -template <> -__global__ void FloorKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = floor(input[i]); - } - return; -} -template <> -__global__ void FloorKernel(const half *input, half *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = hfloor(input[i]); - } - return; -} -template -__global__ void RintKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = rintf(input[i]); - } - return; -} -template <> -__global__ void RintKernel(const double *input, double *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = rint(input[i]); - } - return; -} -template <> -__global__ void RintKernel(const half *input, half *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = hrint(input[i]); - } - return; -} -template -__global__ void RoundKernel(const T *input, T *output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = nearbyintf(input[i]); - } - return; -} -template <> -__global__ void RoundKernel(const double *input, double 
*output, const size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = nearbyint(input[i]); - } - return; -} -template -__global__ void SignKernel(const T *input, T *output, const size_t count) { - T zero = 0.0; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - T res; - if (input[i] < zero) { - res = -1; - } else if (input[i] > zero) { - res = 1; - } else { - res = 0; - } - output[i] = static_cast(res); - } - return; -} -template -void Exponential(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - ExponentialKernel<<>>(input, output, count); - return; -} -template -void Expm1(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - Expm1Kernel<<>>(input, output, count); - return; -} -template -void Logarithm(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - LogarithmKernel<<>>(input, output, count); - return; -} -template -void Log1p(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - Log1pKernel<<>>(input, output, count); - return; -} -template -void Erf(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - ErfKernel<<>>(input, output, count); - return; -} -template -void Erfc(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - ErfcKernel<<>>(input, output, count); - return; -} -template -void Negative(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - NegativeKernel<<>>(input, output, count); - return; -} -template -void Reciprocal(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - ReciprocalKernel<<>>(input, output, count); - return; -} -template -void Square(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - SquareKernel<<>>(input, output, count); - return; -} -template -void Pow(const T *input, T *output, const 
size_t count, cudaStream_t cuda_stream) { - PowKernel<<>>(input, output, count); - return; -} -template -void Sqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - SqrtKernel<<>>(input, output, count); - return; -} -template -void Sin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - SinKernel<<>>(input, output, count); - return; -} -template -void Cos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - CosKernel<<>>(input, output, count); - return; -} -template -void Asin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - AsinKernel<<>>(input, output, count); - return; -} -template -void ACos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - ACosKernel<<>>(input, output, count); - return; -} -template -void Atan(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - AtanKernel<<>>(input, output, count); - return; -} -template -void Asinh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - AsinhKernel<<>>(input, output, count); - return; -} -template -void Acosh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - AcoshKernel<<>>(input, output, count); - return; -} -template -void Rsqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - RsqrtKernel<<>>(input, output, count); - return; -} -template -void Abs(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - AbsKernel<<>>(input, output, count); - return; -} -template -void Abs(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream) { - AbsKernel<<>>(input, output, count); - return; -} -template -void Real(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream) { - RealKernel<<>>(input, output, count); - return; -} -template -void Real(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - 
RealKernel<<>>(input, output, count); - return; -} -template -void Imag(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream) { - ImagKernel<<>>(input, output, count); - return; -} -template -void Imag(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - ImagKernel<<>>(input, output, count); - return; -} -template -void Conj(const Complex *input, Complex *output, const size_t count, cudaStream_t cuda_stream) { - ConjKernel<<>>(input, output, count); - return; -} -template -void Conj(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - ConjKernel<<>>(input, output, count); - return; -} -template -void Floor(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - FloorKernel<<>>(input, output, count); - return; -} -template -void Rint(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - RintKernel<<>>(input, output, count); - return; -} -template -void Round(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - RoundKernel<<>>(input, output, count); - return; -} -template -void Sign(const T *input, T *output, const size_t count, cudaStream_t cuda_stream) { - SignKernel<<>>(input, output, count); - return; -} - -// double -template void Exponential(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Expm1(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Logarithm(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Log1p(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Erf(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Erfc(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Negative(const double *input, double *output, const size_t count, 
cudaStream_t cuda_stream); -template void Reciprocal(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Square(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Sqrt(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Sin(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Cos(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Asin(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void ACos(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Atan(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Asinh(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Acosh(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Rsqrt(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Abs(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Floor(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Rint(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Round(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Sign(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Real(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const double *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const double *input, double *output, const size_t count, cudaStream_t 
cuda_stream); - - -// float -template void Exponential(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Expm1(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Logarithm(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Log1p(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Erf(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Erfc(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Negative(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Reciprocal(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Square(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Sqrt(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Sin(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Cos(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Asin(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void ACos(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Atan(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Asinh(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Acosh(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Rsqrt(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Abs(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); 
-template void Floor(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Rint(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Round(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Sign(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Real(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const float *input, float *output, const size_t count, cudaStream_t cuda_stream); - -// half -template void Exponential(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Expm1(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Logarithm(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Log1p(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Erf(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Erfc(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Negative(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Reciprocal(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Square(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Sqrt(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Sin(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Cos(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Asin(const half *input, half 
*output, const size_t count, cudaStream_t cuda_stream); -template void ACos(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Atan(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Asinh(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Acosh(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Rsqrt(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Abs(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Floor(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Rint(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Round(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Sign(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Real(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const half *input, half *output, const size_t count, cudaStream_t cuda_stream); - -// int8 -template void Exponential(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Expm1(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Logarithm(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Log1p(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Erf(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Erfc(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void 
Negative(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Reciprocal(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Square(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Sqrt(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Sin(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Cos(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Asin(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void ACos(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Atan(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Asinh(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Acosh(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Rsqrt(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Abs(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Floor(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Rint(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Round(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Sign(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Real(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const char *input, char *output, const size_t count, cudaStream_t cuda_stream); 
- -// uint8 -template void Exponential(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Expm1(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Logarithm(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Log1p(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Erf(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Erfc(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Negative(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Reciprocal(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Square(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Sqrt(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Sin(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Cos(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Asin(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void ACos(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Atan(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Asinh(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Acosh(const unsigned char 
*input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Rsqrt(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Abs(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Floor(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Rint(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Round(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Sign(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Real(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Imag(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); -template void Conj(const unsigned char *input, unsigned char *output, const size_t count, - cudaStream_t cuda_stream); - -// int32 -template void Exponential(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Expm1(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Logarithm(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Log1p(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Erf(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Erfc(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Negative(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Reciprocal(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void 
Square(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Sqrt(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Sin(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Cos(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Asin(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void ACos(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Atan(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Asinh(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Acosh(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Rsqrt(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Abs(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Floor(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Rint(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Round(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Sign(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Real(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const int *input, int *output, const size_t count, cudaStream_t cuda_stream); - -// complex64 -template void Real(const Complex *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const Complex *input, float *output, const size_t count, cudaStream_t cuda_stream); -template void 
Conj(const Complex *input, Complex *output, const size_t count, - cudaStream_t cuda_stream); - -// complex128 -template void Real(const Complex *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const Complex *input, double *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const Complex *input, Complex *output, const size_t count, - cudaStream_t cuda_stream); - -// bool -template void Real(const bool *input, bool *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const bool *input, bool *output, const size_t count, cudaStream_t cuda_stream); - -// int16 -template void Real(const int16_t *input, int16_t *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const int16_t *input, int16_t *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const int16_t *input, int16_t *output, const size_t count, cudaStream_t cuda_stream); - -// uint16 -template void Real(const uint16_t *input, uint16_t *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const uint16_t *input, uint16_t *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const uint16_t *input, uint16_t *output, const size_t count, cudaStream_t cuda_stream); - -// uint32 -template void Real(const uint32_t *input, uint32_t *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const uint32_t *input, uint32_t *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const uint32_t *input, uint32_t *output, const size_t count, cudaStream_t cuda_stream); - -// int64 -template void Real(const int64_t *input, int64_t *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const int64_t *input, int64_t *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const int64_t *input, int64_t *output, const size_t count, cudaStream_t cuda_stream); - -// uint64 -template 
void Real(const uint64_t *input, uint64_t *output, const size_t count, cudaStream_t cuda_stream); -template void Imag(const uint64_t *input, uint64_t *output, const size_t count, cudaStream_t cuda_stream); -template void Conj(const uint64_t *input, uint64_t *output, const size_t count, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh deleted file mode 100755 index 346f7bcdf50..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Copyright 2019-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_ - -#include "plugin/device/gpu/hal/device/cuda_common.h" -#include "utils/complex.h" -template -void Exponential(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Expm1(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Logarithm(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Log1p(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Erf(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Erfc(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Negative(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Reciprocal(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Square(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Sqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Rsqrt(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Sin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Cos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Asin(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void ACos(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Atan(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Asinh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Acosh(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Abs(const T *input, T *output, 
const size_t count, cudaStream_t cuda_stream); -template -void Floor(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Rint(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Round(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Sign(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Real(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Real(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Imag(const Complex *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Imag(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -template -void Conj(const Complex *input, Complex *output, const size_t count, cudaStream_t cuda_stream); -template -void Conj(const T *input, T *output, const size_t count, cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh deleted file mode 100644 index 0432e0ac384..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_ -#include -#include "plugin/device/gpu/hal/device/cuda_common.h" - -template -void CalUniformCandidateSampler(const int64_t true_size, const int64_t num_sampled, const S prob_val, - S *true_expected_count, S *sampled_expected_count, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_UNIFORM_CANDIDATE_SAMPLER_IMPL_CUH_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh deleted file mode 100644 index 92dc4740df7..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh +++ /dev/null @@ -1,22 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_UNIQUE_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_UNIQUE_H_ -template -int CalUnique(const T *input, int num_elements, S *input_index, S *sorted_index, T *output, S *index, - cudaStream_t cuda_stream); -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_UNIQUE_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cu deleted file mode 100755 index 8af59861828..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unpack.cu +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include "plugin/device/gpu/kernel/cuda_impl/unpack.cuh" -template -__global__ void Unpack(const size_t size, const size_t output_num, - const size_t dims_after_axis, T** outputs, const T* input) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { - size_t cur_input_index = pos / dims_after_axis % output_num; - size_t cycle_len = output_num * dims_after_axis; - size_t local_index = pos / cycle_len * dims_after_axis + pos % cycle_len % dims_after_axis; - outputs[cur_input_index][local_index] = input[pos]; - } - return; -} - -template -void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, T** outputs, const T* input, - cudaStream_t cuda_stream) { - Unpack<<>>(size, output_num, - dims_after_axis, outputs, input); - return; -} - -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, int8_t** outputs, const int8_t* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, int16_t** outputs, const int16_t* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, int** outputs, const int* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, int64_t** outputs, const int64_t* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, uint8_t** outputs, const uint8_t* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, uint16_t** outputs, const uint16_t* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t 
dims_after_axis, uint32_t** outputs, const uint32_t* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, uint64_t** outputs, const uint64_t* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, half** outputs, const half* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, float** outputs, const float* input, - cudaStream_t cuda_stream); -template void UnpackKernel(const size_t size, const size_t output_num, - const size_t dims_after_axis, bool** outputs, const bool* input, - cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cu deleted file mode 100644 index 0f37245f60d..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cu +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/util.cuh" - -template -__global__ void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - T* input_addr, S* ids_addr, T* output_addr) { - for (int input_index = blockIdx.x * blockDim.x + threadIdx.x; input_index < input_dim0 * input_dim1; - input_index += blockDim.x * gridDim.x) { - size_t j = input_index / input_dim1; - size_t k = input_index % input_dim1; - - S i = ids_addr[j]; - if (i < 0 || i >= output_dim0) { - continue; - } - size_t output_index = i * output_dim1 + k; - MsAtomicAdd(output_addr + output_index, input_addr[input_index]); - } -} - -template -void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream) { - int size = input_dim0 * input_dim1; - UnsortedSegmentSum<<>>(input_dim0, input_dim1, - output_dim0, output_dim1, input_addr, ids_addr, output_addr); - return; -} - -template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - double* input_addr, int* ids_addr, double* output_addr, cudaStream_t stream); -template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - double* input_addr, int64_t* ids_addr, double* output_addr, cudaStream_t stream); - -template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - float* input_addr, int* ids_addr, float* output_addr, cudaStream_t stream); -template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - float* input_addr, int64_t* ids_addr, float* output_addr, cudaStream_t stream); - -template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - half* input_addr, int* ids_addr, 
half* output_addr, cudaStream_t stream); -template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - half* input_addr, int64_t* ids_addr, half* output_addr, cudaStream_t stream); - -template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - int* input_addr, int* ids_addr, int* output_addr, cudaStream_t stream); -template void UnsortedSegmentSum(size_t input_dim0, size_t input_dim1, size_t output_dim0, size_t output_dim1, - int* input_addr, int64_t* ids_addr, int* output_addr, cudaStream_t stream); - - - diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel_factory.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel_factory.cc index d4c22f699e6..22b67974a1a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel_factory.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel_factory.cc @@ -18,7 +18,7 @@ #include "utils/ms_utils.h" #include "runtime/device/kernel_info.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "kernel/common_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/addn_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/addn_gpu_kernel.h index 97b13b8b379..631ad498e49 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/addn_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/addn_gpu_kernel.h @@ -22,7 +22,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/math/assign_add_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/assign_add_gpu_kernel.h index 3fa7f97df38..52388592dea 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/assign_add_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/assign_add_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/assign_add_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/assign_add_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_complex_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_complex_gpu_kernel.h index a74873cd924..febbe873e47 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_complex_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_complex_gpu_kernel.h @@ -24,7 +24,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "backend/common/session/anf_runtime_algorithm.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h index 4ec4d963aa8..5b965853545 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "backend/common/session/anf_runtime_algorithm.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_grad_gpu_kernel.h index 0db01af46bb..cdb0885abf1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/broadcast_grad_gpu_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_grad_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "backend/common/session/anf_runtime_algorithm.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cast_all_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cast_all_gpu_kernel.h index 76ae9eec4e5..2c335613362 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cast_all_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cast_all_gpu_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/cast_all_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_all_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_gpu_kernel.h index 09ce07ecc10..7da6c9b7241 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_gpu_kernel.h @@ -21,9 +21,9 @@ #include #include #include -#include 
"plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_solve_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_solve_gpu_kernel.h index da5be233b06..d6af6258a42 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_solve_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_solve_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_trsm_solve_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_trsm_solve_gpu_kernel.h index 00a2975f149..603be4333ea 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_trsm_solve_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cholesky_trsm_solve_gpu_kernel.h @@ -21,8 +21,8 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/eye_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/matrix_split_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/eye_impl.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_split_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumprod_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumprod_gpu_kernel.h index 5eb46717038..78dae0b7918 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumprod_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumprod_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/cumprod_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumprod_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumsum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumsum_gpu_kernel.h index 8e0bcb52731..cb2887af803 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumsum_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/cumsum_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/determinant_triangle_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/determinant_triangle_gpu_kernel.h index 2d94bc31184..824482d83e6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/determinant_triangle_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/determinant_triangle_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include 
"plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/determinant_triangle_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/determinant_triangle_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_c_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_c_gpu_kernel.h index cb5c9d661a1..4463024b463 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_c_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_c_gpu_kernel.h @@ -25,15 +25,15 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "utils/convert_utils.h" -#include "utils/complex.h" -#include "plugin/device/gpu/kernel/cuda_impl/real_to_complex_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/real_to_complex_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_gpu_kernel.h index 98075069f27..6c4437931ba 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/eigh_gpu_kernel.h @@ -24,13 +24,13 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/triangle_matrix_copy_impl.cuh" -#include 
"plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/triangle_matrix_copy_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "utils/convert_utils.h" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/einsum_helper.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/einsum_helper.h index 1c9aa4ab6be..d62c649bc84 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/einsum_helper.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/einsum_helper.h @@ -28,9 +28,9 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/einsum_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/einsum_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/equalcount_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/equalcount_gpu_kernel.h index 5029e91941c..8acd40ebcbd 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/equalcount_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/equalcount_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include 
"plugin/device/gpu/kernel/cuda_impl/equalcount_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/equalcount_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/float_status_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/float_status_gpu_kernel.h index 758b3148827..a789bfc9ccf 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/float_status_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/float_status_gpu_kernel.h @@ -23,8 +23,8 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/float_status_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/float_status_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/index_add_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/index_add_gpu_kernel.h index 22ce1d636f8..706d17c2fea 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/index_add_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/index_add_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/index_add_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/index_add_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/linspace.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/linspace.h index de37b2896c9..738d602dedd 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/linspace.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/linspace.h @@ -22,7 +22,7 @@ #include #include 
"plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/linspace.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/linspace.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/logical_not_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/logical_not_gpu_kernel.h index 915f3e178b8..82709487c0b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/logical_not_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/logical_not_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/logical_not_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/logical_not_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "utils/convert_utils.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_gpu_kernel.h index 75139d839bd..08bd7ec2fe1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_gpu_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "utils/convert_utils.h" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_solve_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_solve_gpu_kernel.h index e8a5b0e3388..4f11a03c568 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_solve_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/lu_solve_gpu_kernel.h @@ -26,7 +26,7 @@ 
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "utils/convert_utils.h" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/matmul_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/matmul_gpu_kernel.h index 5b78d5a0c03..c6114592578 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/matmul_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/matmul_gpu_kernel.h @@ -24,7 +24,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/fill_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fill_impl.cuh" #include "utils/convert_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/multinomial_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/multinomial_gpu_kernel.h index b3eaadd590c..297a7e20b1d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/multinomial_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/multinomial_gpu_kernel.h @@ -26,8 +26,8 @@ #include "plugin/device/gpu/hal/device/gpu_memory_allocator.h" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/multinomial_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/cumsum_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/multinomial_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cumsum_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/nms_with_mask_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/math/nms_with_mask_gpu_kernel.h index d5e2fcd7c70..521317e8963 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/nms_with_mask_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/nms_with_mask_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/nms_with_mask_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/nms_with_mask_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/random_op_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/random_op_gpu_kernel.h index 6b85adafe9b..810c1a28d86 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/random_op_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/random_op_gpu_kernel.h @@ -24,7 +24,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/random_op_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_op_impl.cuh" #include "include/curand.h" #include "utils/ms_context.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/solve_triangular_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/solve_triangular_gpu_kernel.h index 66ce1e64541..6093bd8c777 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/solve_triangular_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/solve_triangular_gpu_kernel.h @@ -24,7 +24,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" 
namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/square_sum_all_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/square_sum_all_gpu_kernel.h index 90736d2be71..54e9c61330b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/square_sum_all_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/square_sum_all_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/square_sum_all_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/square_sum_all_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/squared_difference_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/squared_difference_kernel.h index 8c5e5c778dd..170a342e0ce 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/squared_difference_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/squared_difference_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.cc index 7a7b8653c80..b5d19e3974c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.cc @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "utils/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" #include "plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h index 04440f79f4b..d27110cf9b3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h @@ -24,7 +24,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_grad_gpu_kernel.h index 01f390e530d..a12768c26ed 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_grad_gpu_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/unary_op_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/math/update_thor_gradient.h b/mindspore/ccsrc/plugin/device/gpu/kernel/math/update_thor_gradient.h index 4e64d99acd3..3e17aeeb6c3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/update_thor_gradient.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/update_thor_gradient.h @@ -23,8 +23,8 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include 
"plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/convert_gradient_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "utils/convert_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_gpu_kernel.h index 43d9b03c421..a06f9cdd9fd 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_gpu_kernel.h @@ -25,7 +25,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "utils/utils.h" -#include "plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_grad_gpu_kernel.h index 2f3ac96e6a4..d4d34fc69e8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nccl/sync_batch_norm_grad_gpu_kernel.h @@ -25,7 +25,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "utils/utils.h" -#include "plugin/device/gpu/kernel/cuda_impl/sync_batch_norm_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sync_batch_norm_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/activation_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/activation_gpu_kernel.h index 
17cb6c616e5..0267002ca20 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/activation_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/activation_gpu_kernel.h @@ -23,7 +23,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adagrad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adagrad_gpu_kernel.h index 9cf9ee60322..34b2471bbf2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adagrad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adagrad_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_gpu_kernel.h index 1859fc83329..e3c9a9cfdf3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh" namespace mindspore { namespace kernel { constexpr size_t INPUT_NUM = 10; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_weight_decay_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_weight_decay_gpu_kernel.h index 
59cadec12c8..c2260757903 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_weight_decay_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adam_weight_decay_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh" namespace mindspore { namespace kernel { constexpr size_t INPUT_NUM = 9; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_gpu_kernel.h index 758d2042719..cab901423cb 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_grad_gpu_kernel.h index 0ae0cc8fe6f..adc7b8f67cc 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/adaptive_avg_pool2d_grad_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/apply_gradient_descent_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/apply_gradient_descent_gpu_kernel.h index 7be111f46f2..a61fec56d51 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/apply_gradient_descent_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/apply_gradient_descent_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/batch_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/batch_norm_grad_gpu_kernel.h index 9c7d6dbe5fb..a9d4dc3ff82 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/batch_norm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/batch_norm_grad_gpu_kernel.h @@ -24,7 +24,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bce_with_logits_loss_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bce_with_logits_loss_kernel.h index f699b98091c..4c210ba30b2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bce_with_logits_loss_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bce_with_logits_loss_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include 
"plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bce_with_logits_loss_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bias_add_grad_gpu_kenel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bias_add_grad_gpu_kenel.h index a631f60c7d2..1f2d2be7e28 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bias_add_grad_gpu_kenel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/bias_add_grad_gpu_kenel.h @@ -24,7 +24,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/bias_add_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_gpu_kernel.h index 1453a1b3b9d..8b12f8e0cc4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh" #include "kernel/common_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_grad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_grad_kernel.h index f8679afdade..d28c1b9370d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_grad_kernel.h +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/binary_cross_entropy_grad_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh" #include "kernel/common_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/clip_grad_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/clip_grad_norm_gpu_kernel.h index e151dc6d618..f73a5337ede 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/clip_grad_norm_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/clip_grad_norm_gpu_kernel.h @@ -24,7 +24,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/clip_grad_norm_impl.cuh" namespace mindspore::kernel { constexpr size_t kArgMaxDim = 7; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/combine_momentum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/combine_momentum_gpu_kernel.h index 4e162839f86..7652713b5f6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/combine_momentum_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/combine_momentum_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_gpu_kernel.h index ea1ef4b8e01..b107df67ece 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h index e2f6aafd961..ab5ad4245dd 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h index 7047b900dc5..3b5660483b8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_gpu_kernel.h index 82083b5cd66..9c38cf96184 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_filter_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_filter_gpu_kernel.h index d7127c2d4ba..a47db13b235 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_filter_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_filter_gpu_kernel.h @@ -21,11 +21,11 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_input_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_input_gpu_kernel.h index cb325b0d951..28e887bdf71 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_input_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_grad_input_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_transpose_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_transpose_gpu_kernel.h index 795ea225198..b47f90f2e8e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_transpose_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv3d_transpose_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ctcloss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ctcloss_gpu_kernel.h index 3a415c791b8..1e4948c1b37 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ctcloss_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ctcloss_gpu_kernel.h @@ -24,7 +24,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/hal/device/gpu_memory_allocator.h" -#include "plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/ctcloss_impl.cuh" namespace mindspore { namespace kernel { constexpr size_t kPrevOutput0th = 0; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout3d_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout3d_gpu_kernel.h index dc517b7df10..f922b08c458 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout3d_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout3d_gpu_kernel.h @@ -21,7 +21,7 @@ #include 
#include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/dropout3d_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout3d_impl.cuh" #include "include/curand.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_gpu_kernel.h index ec803510ed9..717678c10cb 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh" #include "include/curand.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_grad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_grad_kernel.h index 21993303113..f29a282d694 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_grad_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/dropout_grad_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/dropout_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dropout_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ftrl_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ftrl_gpu_kernel.h index b6e00a6fe66..381a2f6fe1e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ftrl_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ftrl_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include 
"plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/ftrl_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/ftrl_impl.cuh" namespace mindspore { namespace kernel { constexpr size_t INPUT_NUM = 8; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_adam_weight_decay.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_adam_weight_decay.h index 8feb0255f65..8e95bcf250f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_adam_weight_decay.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_adam_weight_decay.h @@ -21,7 +21,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/adam_weight_decay_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_grad_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_grad_v2_gpu_kernel.h index 12c4d3ff358..32bcc041b97 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_grad_v2_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_grad_v2_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_v2_gpu_kernel.h index 015395292e0..c258eca61c2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_v2_gpu_kernel.h +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_add_relu_v2_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_scale_momentum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_scale_momentum_gpu_kernel.h index b5d06a0f7bf..9579ad694f0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_scale_momentum_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_scale_momentum_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh" namespace mindspore { namespace kernel { constexpr size_t INPUT_NUM = 6; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_momentum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_momentum_gpu_kernel.h index f67bba6eecb..5ce11019c63 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_momentum_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_momentum_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh" namespace mindspore { namespace kernel { constexpr size_t INPUT_NUM = 6; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_scale_momentum_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_scale_momentum_gpu_kernel.h index 06cc7d9bee4..e86efa29e10 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_scale_momentum_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/fused_weightdecay_scale_momentum_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh" namespace mindspore { namespace kernel { constexpr size_t INPUT_NUM = 7; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_grad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_grad_kernel.h index 7a1edeeee0d..1080e6f9423 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_grad_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_grad_kernel.h @@ -21,7 +21,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_kernel.h index 62554dc8f53..b37c8abc739 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/gelu_kernel.h @@ -21,7 +21,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/gelu_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gelu_impl.cuh" namespace mindspore { namespace kernel { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_gpu_kernel.h index f5072507216..39991192200 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_grad_gpu_kernel.h index 1bae26179d9..e5e672d8141 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hsigmoid_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/hsigmoid_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hsigmoid_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_gpu_kernel.h index 4123ff2a091..01b248e015a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh" namespace mindspore { namespace kernel { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_grad_gpu_kernel.h index d772c1cda4d..2c6e4535a2d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/hswish_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/hswish_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hswish_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/im2col_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/im2col_gpu_kernel.h index 177f811f3ee..1730617aac6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/im2col_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/im2col_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_gpu_kernel.h index d3039b119b5..759b319c067 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_gpu_kernel.h @@ -23,7 +23,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" #include "utils/utils.h" -#include "plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh" namespace mindspore { namespace kernel { diff 
--git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_grad_gpu_kernel.h index 812edbdf594..c9e9a9775ff 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/instance_norm_grad_gpu_kernel.h @@ -24,7 +24,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/instance_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/instance_norm_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_gpu_kernel.h index a2c1aa81efc..75f4fb23d3b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh" #include "kernel/common_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_grad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_grad_kernel.h index c5125546430..03ce54b2374 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_grad_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/kl_div_loss_grad_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh" #include "kernel/common_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2_loss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2_loss_gpu_kernel.h index 121685e9f11..fab679c2ea4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2_loss_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2_loss_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/l2_loss.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2_loss.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_gpu_kernel.h index d10b352a5da..9fe274988c0 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_gpu_kernel.h @@ -22,8 +22,8 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_grad_gpu_kernel.h index 59f6bd61b70..9bc08a94ace 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/l2normalize_grad_gpu_kernel.h @@ -22,8 +22,8 @@ 
#include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/l2normalize_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/l2normalize_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_gpu_kernel.h index 6791f6d1170..b39a97bc122 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_gpu_kernel.h index 134c5699f35..7d1790adccc 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_grad_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_grad_gpu_kernel.h index 4ddd774e096..e64fd9b3058 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/layer_norm_grad_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/layer_norm_grad_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/layer_norm_grad_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_gpu_kernel.h index b9bc6a4987b..eb9918ffaf2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_gpu_kernel.h @@ -22,8 +22,8 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh" #include "utils/utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_grad_gpu_kernel.h index 7dde87fc38b..c0e0bd481b8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/local_response_norm_grad_gpu_kernel.h @@ -22,8 +22,8 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" 
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/local_response_norm_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/local_response_norm_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh" #include "utils/utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_gpu_kernel.h index 659aa168081..458b75890ce 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_grad_gpu_kernel.h index d263c02f91b..065312c5d93 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/maxpool_with_argmax_grad_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/maxpool_with_argmax_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/maxpool_with_argmax_grad_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_gpu_kernel.h index 79c07c84d66..efb3a8f08b2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_grad_gpu_kernel.h index 06d8d715684..c7d837fa2b4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/mirror_pad_grad_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/mirror_pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/mirror_pad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/momentum_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/momentum_gpu_kernel.h index 03e667c944d..3673fb93302 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/momentum_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/momentum_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/momentum_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/momentum_impl.cuh" namespace mindspore { namespace kernel { constexpr size_t INPUT_NUM = 5; diff 
--git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_gpu_kernel.h index fe5c27f1959..e1641a8f3d6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh" #include "kernel/common_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_grad_gpu_kernel.h index c5533e4fb24..45193109ad8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/nll_loss_grad_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/loss_with_reduction_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/loss_with_reduction_impl.cuh" #include "kernel/common_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pad_gpu_kernel.h index 972902526be..f1aa84afe74 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pad_gpu_kernel.h @@ -22,8 +22,8 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh" #include 
"plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_gpu_kernel.h index cc5f2e4a8f7..ff1286d9161 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_grad_gpu_kernel.h index 0efd058fc8c..960db387035 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/pooling_grad_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/pad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_gpu_kernel.h index 6c16d19b6b9..aa1d0743992 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_gpu_kernel.h @@ -23,7 +23,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/prelu_impl.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_grad_gpu_kernel.h index 373bc58aa8f..cc3fbe80f5a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/prelu_grad_gpu_kernel.h @@ -23,7 +23,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/prelu_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/prelu_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_gpu_kernel.h index 34f0db1d592..4998d402856 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_grad_gpu_kernel.h index 8a0d4a86eba..9a481ae8515 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/ps_roi_pooling_grad_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/psroi_pooling_impl.cuh" 
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/psroi_pooling_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_gpu_kernel.h index 6573863641d..51e700b3032 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_gpu_kernel.h index ce5852877f0..4022810d7dd 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_gpu_kernel.h @@ -23,7 +23,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/relu_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_v2_gpu_kernel.h index c3b9708d26a..d9e0395bcab 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_v2_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_grad_v2_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_v2_gpu_kernel.h index 72be64cf00f..5eae299261b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_v2_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/relu_v2_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/relu_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/relu_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_gpu_kernel.h index 5059dbd3422..eb3855ff0e5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_grad_gpu_kernel.h index 28f39509f1b..dd83598bce8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/resize_bilinear_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/resize_bilinear_impl.cuh" 
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_bilinear_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/rmsprop_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/rmsprop_gpu_kernel.h index 9ff5d483084..28d8b6ca0ae 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/rmsprop_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/rmsprop_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/rmsprop_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/rmsprop_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_gpu_kernel.h index 53f631a1da1..52b08df28d3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_grad_gpu_kernel.h index 7578704d7bb..f655555a152 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/roi_align_grad_gpu_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/roi_align_impl.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/roi_align_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sgd_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sgd_gpu_kernel.h index 82c9f815f31..e78a7c82b20 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sgd_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sgd_gpu_kernel.h @@ -18,7 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_NN_SGD_KERNEL_H_ #include -#include "plugin/device/gpu/kernel/cuda_impl/sgd_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sgd_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h index f2491bfae49..70f6873cc58 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h index e92086d25f3..31f51d8a64c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include 
#include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sigmoid_cross_entropy_with_logits_grad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_gpu_kernel.h index 4d227013cd0..abf91d8fa05 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_grad_gpu_kernel.h index 57b81854cf6..a119076e0ff 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/smooth_l1_loss_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/smooth_l1_loss_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/smooth_l1_loss_impl.cuh" namespace mindspore { namespace kernel { template diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_cross_entropy_with_logits_gpu_kernel.h index f712642eb47..682036ae87c 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_cross_entropy_with_logits_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_gpu_kernel.h index 92bec8ac22a..a05858aec7c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_gpu_kernel.h @@ -23,7 +23,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_grad_gpu_kernel.h index dad15533522..188059f7ab2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softmax_grad_gpu_kernel.h @@ -23,7 +23,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh" namespace mindspore { namespace kernel { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_gpu_kernel.h index 4fab63e732f..a33f0f23326 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_gpu_kernel.h @@ -21,7 +21,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_grad_gpu_kernel.h index 06abc62edd5..887ba91629e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/softplus_grad_gpu_kernel.h @@ -21,7 +21,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/softplus_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/softplus_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_apply_proximal_adagrad_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_apply_proximal_adagrad_kernel.h index b0c8c69ef0e..2916745ee4b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_apply_proximal_adagrad_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_apply_proximal_adagrad_kernel.h @@ -25,7 +25,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include 
"plugin/device/gpu/kernel/cuda_impl/sparse_apply_proximal_adagrad_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_apply_proximal_adagrad_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_ftrl_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_ftrl_gpu_kernel.h index ded51c00bb3..990127a0605 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_ftrl_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_ftrl_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/sparse_ftrl_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/sparse_ftrl_impl.cuh" namespace mindspore { namespace kernel { constexpr size_t INPUT_NUM = 5; diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h index d5fc085fc9b..1f467e88714 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cross_entropy_impl.cuh" #include "plugin/device/gpu/kernel/kernel_constants.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_decode_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_decode_gpu_kernel.h index 2b01569cf25..390de32af9b 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_decode_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_decode_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include #include -#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_decode_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_encode_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_encode_gpu_kernel.h index 28559c72e67..d89f80be69f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_encode_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/boundingbox_encode_gpu_kernel.h @@ -19,7 +19,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/boundingbox_encode_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/check_valid_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/check_valid_gpu_kernel.h index 77957902c5e..8dfa8e82cd9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/check_valid_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/check_valid_gpu_kernel.h @@ -19,7 +19,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/check_valid_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/check_valid_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.h index 
b19bf3b8b8f..7a9a7ac6e69 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.h @@ -24,7 +24,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.cc index 06c0b8fb136..b846f51d774 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.cc @@ -16,7 +16,7 @@ #include "plugin/device/gpu/kernel/other/dynamic_stitch_gpu_kernel.h" #include #include "kernel/common_utils.h" -#include "plugin/device/gpu/kernel/cuda_impl/dynamic_stitch_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_stitch_impl.cuh" #include "plugin/device/gpu/hal/device/gpu_common.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/iou_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/other/iou_gpu_kernel.h index 9aa4339681e..3c09ddee697 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/iou_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/other/iou_gpu_kernel.h @@ -19,7 +19,7 @@ #include #include -#include "plugin/device/gpu/kernel/cuda_impl/iou_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/iou_impl.cuh" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_gpu_kernel.h 
b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_gpu_kernel.h index e1a0c4ea144..ffef444f504 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_grad_gpu_kernel.h index 0c0c0b93df8..92c8ef0f649 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold2_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_gpu_kernel.h index 840c1d408df..f24ff530b6d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_gpu_kernel.h @@ -21,7 +21,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/kernel_constants.h" -#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_grad_gpu_kernel.h index 3c10b80cb17..8ba620b24a7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/batchnorm_fold_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_gpu_kernel.h index 002ec255724..b17888a5d32 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.cc index 2d18708d4ff..5befe79234a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.cc @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h" 
-#include "plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h index 366738ae507..996e26891c6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/correction_mul_grad_gpu_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc index 3bf75434ba9..e6e9fef93a2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh" #include #include #include diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc index cc62653da44..704ee886b16 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc @@ -14,7 +14,7 @@ * limitations under the License. */ #include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc index bbab3696446..74cca389fc4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh" #include #include #include diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc index 89bf52e91c6..30432bab050 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ #include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.cc index 5556795993f..3d907e915aa 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.cc @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh" #include #include #include diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.cc index d16860c541c..f2055b4e923 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.cc @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/fake_quant_perchannel_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.cc index 5345d19a92a..514da145d36 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.cc @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh" #include #include #include diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.cc index b7d27020ad5..6259e78ad0a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.cc @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/fake_quant_perlayer_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.cc index 5bc204ebbdb..0e387af2b73 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.cc @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh" #include #include #include diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.cc 
b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.cc index 1adc65610de..08b6d8d6c66 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.cc @@ -15,7 +15,7 @@ */ #include "plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.h" -#include "plugin/device/gpu/kernel/cuda_impl/minmax_update_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh" #include #include #include diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_categorical_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_categorical_gpu_kernel.h index baace853243..22b7789319a 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_categorical_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_categorical_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/random_categorical.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_categorical.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_choice_with_mask_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_choice_with_mask_gpu_kernel.h index f46d1eba0b3..7f290fbab07 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_choice_with_mask_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/random/random_choice_with_mask_gpu_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/random_choice_with_mask_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/random_choice_with_mask_impl.cuh" 
namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/random/uniform_candidate_sampler_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/random/uniform_candidate_sampler_gpu_kernel.h index 638783918c3..0b18458243f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/random/uniform_candidate_sampler_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/random/uniform_candidate_sampler_gpu_kernel.h @@ -25,7 +25,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/uniform_candidate_sampler_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/uniform_candidate_sampler_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/rl/buffer_sample_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/rl/buffer_sample_gpu_kernel.cc index f94e2c8478e..d3ee193db2d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/rl/buffer_sample_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/rl/buffer_sample_gpu_kernel.cc @@ -21,7 +21,7 @@ #include "kernel/common_utils.h" #include "plugin/device/gpu/kernel/cuda_impl/rl/rl_buffer_impl.cuh" -#include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh" #include "plugin/device/gpu/hal/device/gpu_common.h" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/rl/tag_environment.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/rl/tag_environment.cc index 31049380c13..71b02676f8c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/rl/tag_environment.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/rl/tag_environment.cc @@ -22,7 +22,7 @@ #include #include #include "plugin/device/gpu/hal/device/cuda_driver.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_atom_energy_kernel.h index b6dcaba508a..a22dd0e6a9d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_atom_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_atom_energy_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_atom_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_energy_kernel.h index ac5155d7c24..21daaea65db 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_energy_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_kernel.h index 5afb3dc5066..b638c05b2f7 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_kernel.h @@ -22,7 
+22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_with_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_with_atom_energy_kernel.h index 62ec9169957..54e44c84521 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_with_atom_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/angle/angle_force_with_atom_energy_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/angle/angle_force_with_atom_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_atom_energy_cuda_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_atom_energy_cuda_gpu_kernel.h index 0dd63d6a938..0aa39729ae5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_atom_energy_cuda_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_atom_energy_cuda_gpu_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_energy_cuda_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_energy_cuda_gpu_kernel.h index 96c2bbf7f14..5be844e02c9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_energy_cuda_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_energy_cuda_gpu_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_cuda_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_cuda_gpu_kernel.h index 871d8ac9d48..fcb51d8c4d2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_cuda_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_cuda_gpu_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_and_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_and_virial_kernel.h index 873c243450a..ae7e0d0572e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_and_virial_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_and_virial_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include 
"plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_kernel.h index c4f9f82f183..4c735ca7512 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_energy_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_virial_kernel.h index 102fd693d4a..55c1977aeca 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_virial_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/bond/bond_force_with_atom_virial_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/atomcrdtocv_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/atomcrdtocv_kernel.h index 7f577d7fa14..02b7550fca4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/atomcrdtocv_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/atomcrdtocv_kernel.h @@ -26,7 +26,7 @@ #include 
"plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_kernel.h index f13ac4d4dab..9d0243d0977 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_quarter_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_quarter_kernel.h index 2d67f62a78b..935965cc1c3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_quarter_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/crd_to_uint_crd_quarter_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/crd_to_uint_crd_quarter_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/get_center_of_mass_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/get_center_of_mass_kernel.h index 
5ad1021e99a..6c390793a89 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/get_center_of_mass_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/get_center_of_mass_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/get_center_of_mass_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/getcenter_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/getcenter_kernel.h index 38662c9dc8c..f89ca37698f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/getcenter_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/getcenter_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/getcenter_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/map_center_of_mass_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/map_center_of_mass_kernel.h index cc7019080a1..97dc17d5e58 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/map_center_of_mass_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/map_center_of_mass_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include 
"plugin/device/gpu/kernel/cuda_impl/sponge/common/map_center_of_mass_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/mdtemperature_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/mdtemperature_kernel.h index addec135971..f3f8339da2f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/mdtemperature_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/mdtemperature_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/mdtemperature_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/total_c6_get_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/total_c6_get_kernel.h index 3ce82aa1807..a1bcd4a1dde 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/total_c6_get_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/common/total_c6_get_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/common/total_c6_get_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/cal_no_wrap_crd_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/cal_no_wrap_crd_kernel.h index b5d370e3570..8a5cbda121e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/cal_no_wrap_crd_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/cal_no_wrap_crd_kernel.h @@ -26,7 +26,7 @@ #include 
"plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/refresh_boxmaptimes_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/refresh_boxmaptimes_kernel.h index 88eb632e8f8..ba052a8f385 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/refresh_boxmaptimes_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/crdmcmap/refresh_boxmaptimes_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_atom_energy_kernel.h index 9dc3528d87d..94b6bd21e18 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_atom_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_atom_energy_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_energy_kernel.h index fd7c3e95ac7..a8a51293e04 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_energy_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_kernel.h index 926a8a3b314..d07471ee7d5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_with_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_with_atom_energy_kernel.h index b4b6aa337b6..4d4086517dc 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_with_atom_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/dihedral/dihedral_force_with_atom_energy_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/dihedral/dihedral_force_with_atom_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h index 2af64f733f9..4526cf2e1f6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h @@ -25,7 +25,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_energy_kernel.h index bcedc485c97..34fc479bb57 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_energy_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_kernel.h 
index e2c7e9ae6a2..ffd02e02fa8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_kernel.h index da2227ee73f..f5eba16255f 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_kernel.h @@ -21,7 +21,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_update_kernel.h index a007d330eaa..388f5014082 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_update_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_pme_direct_force_update_kernel.h @@ -25,7 +25,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include 
"plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_virial_energy_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_virial_energy_update_kernel.h index 0359ff31383..ac10ff932aa 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_virial_energy_update_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/lj/lj_force_with_virial_energy_update_kernel.h @@ -25,7 +25,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h index 92beeba773a..6c17cee4cdc 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh" namespace mindspore { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_energy_kernel.h index adbe74c15bc..ef3bc821769 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_cf_energy_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h index 26b766a650f..51fd3e02153 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_kernel.h index 00427d8ab4f..17865345593 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_kernel.h +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h index 3b7932b4d92..6e33ebf2561 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_energy_kernel.h index b44d8d66572..edd9c83d3d3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_energy_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h index 49b3cb13234..f51ebc7cc54 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h index ad8af605b8a..baa6a71b36d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h @@ -23,7 +23,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_kernel.h index 
64e2d544a39..416017d726b 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_kernel.h @@ -27,7 +27,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_new_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_new_kernel.h index d0f2348a4d9..3ed8bf99cdd 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_new_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/neighbor_list/neighbor_list_update_new_kernel.h @@ -26,7 +26,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_gradient_descent_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_gradient_descent_kernel.h index 4fc6661f49b..3f0bca623a2 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_gradient_descent_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_gradient_descent_kernel.h @@ -25,7 +25,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include 
"plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_kernel.h index c55aac62a07..720eb6c908c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_kernel.h @@ -28,7 +28,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h index f1cde2bfa0d..dc35a5f8b90 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h @@ -25,7 +25,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_with_max_vel_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_with_max_vel_kernel.h index e8ddd4ba75f..ecac60493cc 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_with_max_vel_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_with_max_vel_kernel.h @@ -25,7 +25,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_with_max_vel_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_with_max_vel_kernel.h index 15805f0a3de..cb03973e778 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_with_max_vel_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_leap_frog_with_max_vel_kernel.h @@ -25,7 +25,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_setup_random_state.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_setup_random_state.h index 6326d3eeca6..64277b9c221 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_setup_random_state.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/nvtit/md_iteration_setup_random_state.h @@ -25,7 +25,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/fft_3d_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/fft_3d_kernel.h index fd6c04d3353..6805d39935e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/fft_3d_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/fft_3d_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/fft_3d_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/ifft_3d_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/ifft_3d_kernel.h index c2c0e06144a..1072ad21413 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/ifft_3d_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/ifft_3d_kernel.h @@ -22,7 +22,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/ifft_3d_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_kernel.h index ad2e7ba0ae0..ef8193648c3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_kernel.h @@ -20,7 +20,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include 
"plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_update_kernel.h index 1c43ceeaf95..e611b79cfb1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_update_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_energy_update_kernel.h @@ -24,7 +24,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_energy_update_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_kernel.h index d055a95aefb..2104c1c9b98 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_kernel.h @@ -24,7 +24,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh" namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_update_kernel.h index ee448d3d623..63f60e392d5 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_update_kernel.h +++ 
b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_excluded_force_update_kernel.h @@ -25,7 +25,7 @@ #include #include #include -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh" diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_kernel.h index 0b4f9c235ab..353c3d9b7ca 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_kernel.h @@ -26,7 +26,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_update_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_update_kernel.h index 4f29259d8eb..d71cdadf6e1 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_update_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/pme/pme_reciprocal_force_update_kernel.h @@ -24,7 +24,7 @@ #include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" #include "plugin/device/gpu/kernel/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh" 
namespace mindspore { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_energy_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_energy_kernel.h index 87367c18e1f..441087ae770 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_energy_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_energy_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_atom_energy_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_atom_energy_virial_kernel.h index a854b1a1309..e2a6da86d99 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_atom_energy_virial_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_atom_energy_virial_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_kernel.h index ef162944f99..8e180f58254 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/restrain/restrain_force_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include 
"plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_kernel.h index 1758efd614a..078ae2c1a7d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_with_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_with_virial_kernel.h index d29518fa01b..9b3402134f4 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_with_virial_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_cycle_with_virial_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_kernel.h index 236f5659c4d..672ba22f6c2 100644 --- 
a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_kernel.h @@ -30,7 +30,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_virial_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_virial_kernel.h index 9c04703f69e..983a233316d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_virial_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_force_virial_kernel.h @@ -29,7 +29,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_kernel.h index 4b1ff8a1a78..bdc2cf0f5bf 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/constrain_kernel.h @@ -29,7 +29,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git 
a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/last_crd_to_dr_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/last_crd_to_dr_kernel.h index 589578a676e..990a6b7e99e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/last_crd_to_dr_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/last_crd_to_dr_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_crd_vel_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_crd_vel_kernel.h index 069b482d7e5..3d12554afa9 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_crd_vel_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_crd_vel_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include "plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_uint_crd_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_uint_crd_kernel.h index 37f45d544bf..f1d7f0ec3c3 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_uint_crd_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/sponge/simple_constrain/refresh_uint_crd_kernel.h @@ -26,7 +26,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" -#include 
"plugin/device/gpu/hal/device/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" namespace mindspore { namespace kernel { diff --git a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc b/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc index eb9687af39f..cd4632e1c9c 100644 --- a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc +++ b/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc @@ -16,7 +16,7 @@ #include "ps/ps_cache/gpu/gpu_ps_cache.h" #include "ps/ps_cache/ps_cache_factory.h" -#include "plugin/device/gpu/kernel/cuda_impl/hash_impl.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/hash_impl.cuh" #include "plugin/device/gpu/hal/device/gpu_common.h" #include "plugin/device/gpu/hal/device/cuda_driver.h" #include "plugin/device/gpu/hal/device/gpu_memory_allocator.h" diff --git a/mindspore/python/mindspore/run_check/_check_version.py b/mindspore/python/mindspore/run_check/_check_version.py index 78ee448fed4..673f2b8c100 100644 --- a/mindspore/python/mindspore/run_check/_check_version.py +++ b/mindspore/python/mindspore/run_check/_check_version.py @@ -143,12 +143,12 @@ class GPUEnvChecker(EnvChecker): logger.warning(f"MindSpore version {__version__} and cudDNN version {cudnn_version} " "does not match, please refer to the installation guide for version matching " "information: https://www.mindspore.cn/install. The recommended version is " - "CUDA10.1 with cuDNN7.6.x and CUAD11.1 with cuDNN8.0.x") + "CUDA10.1 with cuDNN7.6.x and CUDA11.1 with cuDNN8.0.x") if cudnn_version and int(cudnn_version) < 800 and int(str(self.v).split('.')[0]) > 10: logger.warning(f"CUDA version {self.v} and cuDNN version {cudnn_version} " "does not match, please refer to the installation guide for version matching " "information: https://www.mindspore.cn/install. 
The recommended version is " - "CUAD11.1 with cuDNN8.0.x") + "CUDA11.1 with cuDNN8.0.x") def _check_version(self): """Check cuda version""" diff --git a/tests/ut/cpp/base/complex_test.cc b/tests/ut/cpp/base/complex_test.cc index 634555eed33..14a61c71548 100644 --- a/tests/ut/cpp/base/complex_test.cc +++ b/tests/ut/cpp/base/complex_test.cc @@ -16,7 +16,7 @@ #include #include "common/common_test.h" -#include "utils/complex.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" namespace mindspore {