!46383 Bugfix: GPU hash table operators produce incorrect results on RTX 3090 GPUs

Merge pull request !46383 from zyli2020/r2.0.0-alpha
2022-12-07 07:57:54 +00:00 · 2022-12-07 07:57:54 +00:00 · 02473f3389
parent b3d8209ffe 35685c47b5
commit 02473f3389
3 changed files with 8 additions and 3 deletions
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/CMakeLists.txt
@@ -35,6 +35,9 @@ if(ENABLE_GPU)
        string(REPLACE "-arch=sm_53;" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
        string(REPLACE "-gencode=arch=compute_53,code=sm_53;" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
        list(APPEND CUDA_NVCC_FLAGS -gencode=arch=compute_70,code=sm_70)
+        list(APPEND CUDA_NVCC_FLAGS -gencode=arch=compute_75,code=sm_75)
+        list(APPEND CUDA_NVCC_FLAGS -gencode=arch=compute_80,code=sm_80)
+        list(APPEND CUDA_NVCC_FLAGS -gencode=arch=compute_86,code=compute_86)
        list(APPEND CUDA_NVCC_FLAGS -std=c++17)
        list(REMOVE_DUPLICATES CUDA_NVCC_FLAGS)
        cuda_add_library(gpu_hash_table STATIC ${CUDA_SRC_LIST})
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_hash_table.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_hash_table.cu
@@ -268,7 +268,9 @@ bool GPUHashTable<Key, Value, Allocator>::Insert(const Key *keys, size_t key_num

  // 2. Insert values into map by indices in blocks.
  size_t total_insert_size = value_dim_ * key_num;
-  InsertValues<<<GET_BLOCKS(total_insert_size), GET_THREADS, 0, cuda_stream>>>(
+  auto block_size = GET_THREADS_MAXSIZE(kBlockSize);
+  auto grid_size = CUDA_BLOCKS_CAL(GET_CTX_DEVICE_ID, total_insert_size, block_size);
+  InsertValues<<<grid_size, block_size, 0, cuda_stream>>>(
    value_dim_, total_insert_size, indices, value, elements_per_block_, lookup_cnts_ptr_, min_lookup_cnt_before_permit_,
    global_timestamp_, update_timestamps_ptr_, idle_flags_ptr_, blocks_ptr_);

--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_hash_table.h
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_hash_table.h
@@ -219,7 +219,7 @@ class GPUHashTable : public HashTable<Key, Value> {
  size_t capacity_{0};

  // The number of elements of one block.
-  static const size_t elements_per_block_{kInitialCapacity};
+  static constexpr size_t elements_per_block_{kInitialCapacity};

  // Record the number of successfully inserted keys.
  cuda::atomic<std::size_t, cuda::thread_scope_device> *insert_success_number_{nullptr};
@@ -230,7 +230,7 @@ class GPUHashTable : public HashTable<Key, Value> {
  curandStatePhilox4_32_10_t *random_gen_state_{nullptr};

  // The block size used to launch cuda kernel for inserting normal distribution random values.
-  int random_gen_threads_per_block_{GET_THREADS};
+  int random_gen_threads_per_block_{kBlockSize};
  // The grid size used to launch cuda kernel for inserting normal distribution random values.
  int random_gen_block_count_{(kMaxThreadsPerBlockRandomGen - 1) / random_gen_threads_per_block_ + 1};