[OpenMP][Clang][NVPTX] Only build one bitcode library for each SM

Since D97003, CUDA 9.2 is the minimum requirement for OpenMP offloading on the NVPTX target. We no longer need macros in the source code to select the right functions based on the CUDA version. We no longer need to compile multiple bitcode libraries for different CUDA versions for each SM. We also don't need to worry about future compatibility with newer CUDA versions. `-target-feature +ptx61` is used in this patch, which corresponds to the highest PTX version that CUDA 9.2 can support. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D97198
2021-03-08 12:02:55 -05:00 · 2021-03-08 12:02:55 -05:00 · c41ae246ac
parent 97a7bc5831
commit c41ae246ac
5 changed files with 44 additions and 97 deletions
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@ -711,7 +711,6 @@ void CudaToolChain::addClangTargetOptions(
  CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));

  clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
-  std::string CudaVersionStr;

  // New CUDA versions often introduce new instructions that are only supported
  // by new PTX version, so we need to raise PTX level to enable them in NVPTX
@ -720,7 +719,6 @@ void CudaToolChain::addClangTargetOptions(
  switch (CudaInstallationVersion) {
 #define CASE_CUDA_VERSION(CUDA_VER, PTX_VER)                                   \
  case CudaVersion::CUDA_##CUDA_VER:                                           \
-    CudaVersionStr = #CUDA_VER;                                                \
    PtxFeature = "+ptx" #PTX_VER;                                              \
    break;
    CASE_CUDA_VERSION(112, 72);
@ -734,9 +732,6 @@ void CudaToolChain::addClangTargetOptions(
    CASE_CUDA_VERSION(90, 60);
 #undef CASE_CUDA_VERSION
  default:
-    // If unknown CUDA version, we take it as CUDA 8.0. Same assumption is also
-    // made in libomptarget/deviceRTLs.
-    CudaVersionStr = "80";
    PtxFeature = "+ptx42";
  }
  CC1Args.append({"-target-feature", PtxFeature});
@ -757,8 +752,7 @@ void CudaToolChain::addClangTargetOptions(
      return;
    }

-    std::string BitcodeSuffix =
-        "nvptx-cuda_" + CudaVersionStr + "-" + GpuArch.str();
+    std::string BitcodeSuffix = "nvptx-" + GpuArch.str();
    addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
                       getTriple());
  }
--- a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc
+++ b/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@ -164,7 +164,7 @@
 // RUN:   -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-BCLIB-USER %s

-// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-cuda_102-sm_35.bc
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-sm_35.bc
 // CHK-BCLIB-USER: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
 // CHK-BCLIB-NOT: {{error:|warning:}}

@ -177,7 +177,7 @@
 // RUN:   -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-BCLIB-WARN %s

-// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-cuda_102-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.

 /// ###########################################################################

--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@ -137,6 +137,7 @@ set(bc_flags -S -x c++ -O1 -std=c++14
             -Xclang -emit-llvm-bc
             -Xclang -aux-triple -Xclang ${aux_triple}
             -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+             -Xclang -target-feature -Xclang +ptx61
             -D__CUDACC__
             -I${devicertl_base_directory}
             -I${devicertl_nvptx_directory}/src)
@ -150,81 +151,51 @@ endif()
 # Create target to build all Bitcode libraries.
 add_custom_target(omptarget-nvptx-bc)

-# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
-# The last element is the default case.
-set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 70 70 70 65 64 63 61 61 60 42)
-# The following two lines of ugly code is not needed when the minimal CMake
-# version requirement is 3.17+.
-list(LENGTH cuda_version_list num_version_supported)
-math(EXPR loop_range "${num_version_supported} - 1")
-
-# Generate a Bitcode library for all the compute capabilities the user
-# requested and all PTX version we know for now.
+# Generate a Bitcode library for all the compute capabilities the user requested
 foreach(sm ${nvptx_sm_list})
-  set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+  set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+  set(bc_files "")
+  foreach(src ${cuda_src_files})
+    get_filename_component(infile ${src} ABSOLUTE)
+    get_filename_component(outfile ${src} NAME)
+    set(outfile "${outfile}-sm_${sm}.bc")

-  # Uncomment the following code and remove those ugly part if the feature
-  # is available.
-  # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
-  foreach(itr RANGE ${loop_range})
-    list(GET cuda_version_list ${itr} cuda_version)
-    list(GET ptx_feature_list ${itr} ptx_num)
-    set(cuda_flags ${sm_flags})
-    list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
-    if("${cuda_version}" MATCHES "^([0-9]+)([0-9])$")
-      set(cuda_version_major ${CMAKE_MATCH_1})
-      set(cuda_version_minor ${CMAKE_MATCH_2})
-    else()
-      libomptarget_error_say(
-        "Unrecognized CUDA version format: ${cuda_version}")
-    endif()
-    list(APPEND cuda_flags
-      "-DCUDA_VERSION=${cuda_version_major}0${cuda_version_minor}0")
-
-    set(bc_files "")
-    foreach(src ${cuda_src_files})
-      get_filename_component(infile ${src} ABSOLUTE)
-      get_filename_component(outfile ${src} NAME)
-      set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
-
-      add_custom_command(OUTPUT ${outfile}
-        COMMAND ${cuda_compiler} ${bc_flags}
-          ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
-        DEPENDS ${infile}
-        IMPLICIT_DEPENDS CXX ${infile}
-        COMMENT "Building LLVM bitcode ${outfile}"
-        VERBATIM
-      )
-      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
-
-      list(APPEND bc_files ${outfile})
-    endforeach()
-
-    set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
-
-    # Link to a bitcode library.
-    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-        COMMAND ${bc_linker}
-          -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
-        DEPENDS ${bc_files}
-        COMMENT "Linking LLVM bitcode ${bclib_name}"
+    add_custom_command(OUTPUT ${outfile}
+      COMMAND ${cuda_compiler} ${bc_flags}
+        ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+      DEPENDS ${infile}
+      IMPLICIT_DEPENDS CXX ${infile}
+      COMMENT "Building LLVM bitcode ${outfile}"
+      VERBATIM
    )
-    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
+    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})

-    set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
-
-    add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
-    add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
-
-    # Copy library to destination.
-    add_custom_command(TARGET ${bclib_target_name} POST_BUILD
-                      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-                      ${LIBOMPTARGET_LIBRARY_DIR})
-
-    # Install bitcode library under the lib destination folder.
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+    list(APPEND bc_files ${outfile})
  endforeach()
+
+  set(bclib_name "libomptarget-nvptx-sm_${sm}.bc")
+
+  # Link to a bitcode library.
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+      COMMAND ${bc_linker}
+        -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+      DEPENDS ${bc_files}
+      COMMENT "Linking LLVM bitcode ${bclib_name}"
+  )
+  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
+
+  set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc")
+
+  add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
+  add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
+
+  # Copy library to destination.
+  add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+                    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                    ${LIBOMPTARGET_LIBRARY_DIR})
+
+  # Install bitcode library under the lib destination folder.
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
 endforeach()

 # Test will be enabled if the building machine supports CUDA
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@ -53,46 +53,28 @@ DEVICE double __kmpc_impl_get_wtime() {
  return (double)nsecs * __kmpc_impl_get_wtick();
 }

-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION < 9020
-  return __nvvm_vote_ballot(1);
-#else
  unsigned int Mask;
  asm volatile("activemask.b32 %0;" : "=r"(Mask));
  return Mask;
-#endif
 }

-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                     int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-#else
-  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
-#endif // CUDA_VERSION
 }

 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                          int32_t Var, uint32_t Delta,
                                          int32_t Width) {
  int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-#if CUDA_VERSION >= 9000
  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-#else
-  return __nvvm_shfl_down_i32(Var, Delta, T);
-#endif // CUDA_VERSION
 }

 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }

 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
  __nvvm_bar_warp_sync(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
 }

 // NVPTX specific kernel initialization