!43958 Add cucollections cmake

Merge pull request !43958 from zyli2020/dynamic_embedding_compile
2022-10-25 14:46:42 +00:00 · 2022-10-25 14:46:42 +00:00 · 04fab5011f
parent 9d365c662b 0f643987eb
commit 04fab5011f
5 changed files with 529 additions and 0 deletions
--- a/cmake/external_libs/cucollections.cmake
+++ b/cmake/external_libs/cucollections.cmake
@ -0,0 +1,13 @@
+set(REQ_URL "https://github.com/NVIDIA/cuCollections/archive/d6ba69b1fdab90ae144301e77eb93a2f130ede1d.tar.gz")
+set(MD5 "196a453e5db52e904a906b13b2b8771c")
+set(INCLUDE "include")
+
+mindspore_add_pkg(cucollections
+        HEAD_ONLY ${INCLUDE}
+        URL ${REQ_URL}
+        MD5 ${MD5}
+        PATCHES ${TOP_DIR}/third_party/patch/cucollections/0001-refine-bitwise-compare.patch
+        PATCHES ${TOP_DIR}/third_party/patch/cucollections/0002-add-get-api-of-dynamic_map.patch
+        PATCHES ${TOP_DIR}/third_party/patch/cucollections/0003-add-erase-and-export-api.patch
+        )
+include_directories(${cucollections_INC})
--- a/cmake/mind_expression.cmake
+++ b/cmake/mind_expression.cmake
@ -58,6 +58,7 @@ endif()

 if(ENABLE_GPU AND GPU_BACKEND_CUDA)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/cub.cmake)
+    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/cucollections.cmake)
    if(ENABLE_MPI)
        include(${CMAKE_SOURCE_DIR}/cmake/external_libs/nccl.cmake)
    endif()
--- a/third_party/patch/cucollections/0001-refine-bitwise-compare.patch
+++ b/third_party/patch/cucollections/0001-refine-bitwise-compare.patch
@ -0,0 +1,53 @@
+---
+ include/cuco/detail/pair.cuh |  3 +--
+ include/cuco/traits.hpp      | 23 +----------------------
+ 2 files changed, 2 insertions(+), 24 deletions(-)
+
+diff --git a/include/cuco/detail/pair.cuh b/include/cuco/detail/pair.cuh
+index 7ea3988..ade6df3 100644
+--- a/include/cuco/detail/pair.cuh
+++ b/include/cuco/detail/pair.cuh
+@@ -131,8 +131,7 @@ template <typename pair_type,
+           typename value_type = typename pair_type::second_type>
+ constexpr bool is_packable()
+ {
+-  return not std::is_void<packed_t<pair_type>>::value and
+-         std::has_unique_object_representations_v<pair_type>;
+  return false;
+ }
+ 
+ /**
+diff --git a/include/cuco/traits.hpp b/include/cuco/traits.hpp
+index 445a40d..948b587 100644
+--- a/include/cuco/traits.hpp
+++ b/include/cuco/traits.hpp
+@@ -34,28 +34,7 @@ namespace cuco {
+  * other `NaN` bit patterns.
+  *
+  */
+-template <typename T, typename = void>
+-struct is_bitwise_comparable : std::false_type {
+-};
+ 
+-/// By default, only types with unique object representations are allowed
+ template <typename T>
+-struct is_bitwise_comparable<T, std::enable_if_t<std::has_unique_object_representations_v<T>>>
+-  : std::true_type {
+-};
+-
+-template <typename T>
+-inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable<T>::value;
+-
+-/**
+- * @brief Declares that a type `Type` is bitwise comparable.
+- *
+- */
+-#define CUCO_DECLARE_BITWISE_COMPARABLE(Type)           \
+-  namespace cuco {                                      \
+-  template <>                                           \
+-  struct is_bitwise_comparable<Type> : std::true_type { \
+-  };                                                    \
+-  }
+-
+inline constexpr bool is_bitwise_comparable_v = true;
+ }  // namespace cuco
--- a/third_party/patch/cucollections/0002-add-get-api-of-dynamic_map.patch
+++ b/third_party/patch/cucollections/0002-add-get-api-of-dynamic_map.patch
@ -0,0 +1,105 @@
+---
+ include/cuco/dynamic_map.cuh | 62 ++++++++++++++++++++++++++++++++++--
+ include/cuco/traits.hpp      |  2 +-
+ 2 files changed, 60 insertions(+), 4 deletions(-)
+
+diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
+index 866f948..af3ea03 100644
+--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
+@@ -103,8 +103,8 @@ class dynamic_map {
+   using key_type        = Key;                               ///< Key type
+   using mapped_type     = Value;                             ///< Type of mapped values
+   using atomic_ctr_type = cuda::atomic<std::size_t, Scope>;  ///< Type of atomic counters
+-  using view_type = typename static_map<Key, Value, Scope>::device_view;  ///< Device view type
+-  using mutable_view_type = typename static_map<Key, Value, Scope>::device_mutable_view;
+  using view_type = typename static_map<Key, Value, Scope, Allocator>::device_view;  ///< Device view type
+  using mutable_view_type = typename static_map<Key, Value, Scope, Allocator>::device_mutable_view;
+   ///< Device mutable view type
+ 
+   dynamic_map(dynamic_map const&) = delete;
+@@ -248,6 +248,62 @@ class dynamic_map {
+    */
+   float get_load_factor() const noexcept { return static_cast<float>(size_) / capacity_; }
+ 
+  /**
+   * @brief Update the size of the hash map.
+   *
+   * @param size The number of the size to be updated.
+   */
+  void update_size(std::size_t size) {
+    size_ = size;
+  }
+
+  /**
+   * @brief Update the size of the submap.
+   *
+   * @param submap_idx The index of submap whose size need to be updated.
+   * @param size The number of the size of submap to be updated.
+   */
+  void update_submap_size(std::size_t submap_idx, std::size_t size) {
+    submaps_[submap_idx]->size_ = size;
+  }
+
+  /**
+   * @brief Gets the all submaps of the hash map.
+   *
+   * @return The all submaps of the hash map.
+   */
+  const std::vector<std::unique_ptr<static_map<key_type, mapped_type, Scope, Allocator>>>& get_submaps() const noexcept {
+    return submaps_;
+  }
+
+  /**
+   * @brief Gets the all mutable views for all submaps of the hash map.
+   *
+   * @return All mutable views for all submaps of the hash map.
+   */
+  thrust::device_vector<mutable_view_type>& get_submap_mutable_views() noexcept { return submap_mutable_views_; }
+
+  /**
+   * @brief Gets the all mutable views for all submaps of the hash map.
+   *
+   * @return All mutable views for all submaps of the hash map.
+   */
+  thrust::device_vector<view_type>& get_submap_views() noexcept { return submap_views_; }
+
+  /**
+   * @brief Gets the max load factor of the hash map.
+   *
+   * @return The max load factor of the hash map.
+   */
+  float get_max_load_factor() const noexcept { return max_load_factor_; }
+
+  /**
+   * @brief Gets minimum insert size of the hash map.
+   *
+   * @return The minimum insert size of the hash map.
+   */
+  std::size_t get_min_insert_size() const noexcept { return min_insert_size_; }
+
+  private:
+   key_type empty_key_sentinel_{};       ///< Key value that represents an empty slot
+   mapped_type empty_value_sentinel_{};  ///< Initial value of empty slot
+@@ -255,7 +311,7 @@ class dynamic_map {
+   std::size_t capacity_{};              ///< Maximum number of keys that can be inserted
+   float max_load_factor_{};             ///< Max load factor before capacity growth
+ 
+-  std::vector<std::unique_ptr<static_map<key_type, mapped_type, Scope>>>
+  std::vector<std::unique_ptr<static_map<key_type, mapped_type, Scope, Allocator>>>
+     submaps_;                                      ///< vector of pointers to each submap
+   thrust::device_vector<view_type> submap_views_;  ///< vector of device views for each submap
+   thrust::device_vector<mutable_view_type>
+diff --git a/include/cuco/traits.hpp b/include/cuco/traits.hpp
+index 948b587..b7fbbc4 100644
+--- a/include/cuco/traits.hpp
+++ b/include/cuco/traits.hpp
+@@ -16,7 +16,7 @@
+ 
+ #pragma once
+ 
+-#include <type_traits>
+// #include <type_traits>
+ 
+ namespace cuco {
+
--- a/third_party/patch/cucollections/0003-add-erase-and-export-api.patch
+++ b/third_party/patch/cucollections/0003-add-erase-and-export-api.patch
@ -0,0 +1,357 @@
+---
+ include/cuco/detail/bitwise_compare.cuh     |  1 +
+ include/cuco/detail/dynamic_map.inl         | 98 ++++++++++++++++++++-
+ include/cuco/detail/dynamic_map_kernels.cuh | 83 +++++++++++++++++
+ include/cuco/dynamic_map.cuh                | 58 +++++++++++-
+ 4 files changed, 236 insertions(+), 4 deletions(-)
+
+diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh
+index 3038943..4bd58c2 100644
+--- a/include/cuco/detail/bitwise_compare.cuh
+++ b/include/cuco/detail/bitwise_compare.cuh
+@@ -18,6 +18,7 @@
+ 
+ #include <cstdint>
+ #include <type_traits>
+#include <cuco/traits.hpp>
+ 
+ namespace cuco {
+ namespace detail {
+diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
+index 0c1d2e3..2425c7d 100644
+--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
+@@ -21,30 +21,68 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
+   std::size_t initial_capacity,
+   sentinel::empty_key<Key> empty_key_sentinel,
+   sentinel::empty_value<Value> empty_value_sentinel,
+-  Allocator const& alloc)
+  Allocator const& alloc,
+  cudaStream_t stream)
+   : empty_key_sentinel_(empty_key_sentinel.value),
+     empty_value_sentinel_(empty_value_sentinel.value),
+     size_(0),
+     capacity_(initial_capacity),
+     min_insert_size_(1E4),
+     max_load_factor_(0.60),
+    counter_allocator_{alloc},
+     alloc_{alloc}
+ {
+   submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
+     initial_capacity,
+     sentinel::empty_key<Key>{empty_key_sentinel},
+     sentinel::empty_value<Value>{empty_value_sentinel},
+-    alloc));
+    alloc, stream));
+   submap_views_.push_back(submaps_[0]->get_device_view());
+   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
+ 
+   CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
+-}  // namespace cuco
+  d_submaps_erase_num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, max_num_submaps_);
+  CUCO_CUDA_TRY(cudaMallocHost(&h_submaps_erase_num_successes_, sizeof(atomic_ctr_type) * (max_num_submaps_)));
+}
+
+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
+dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
+  std::size_t initial_capacity,
+  sentinel::empty_key<Key> empty_key_sentinel,
+  sentinel::empty_value<Value> empty_value_sentinel,
+  sentinel::erased_key<Key> erased_key_sentinel,
+  Allocator const& alloc,
+  cudaStream_t stream)
+  : empty_key_sentinel_(empty_key_sentinel.value),
+    empty_value_sentinel_(empty_value_sentinel.value),
+    erased_key_sentinel_{erased_key_sentinel.value},
+    size_(0),
+    capacity_(initial_capacity),
+    min_insert_size_(1E4),
+    max_load_factor_(0.60),
+    counter_allocator_{alloc},
+    alloc_{alloc}
+{
+  submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
+    initial_capacity,
+    sentinel::empty_key<Key>{empty_key_sentinel},
+    sentinel::empty_value<Value>{empty_value_sentinel},
+    sentinel::erased_key<Key>{erased_key_sentinel},
+    alloc, stream));
+  submap_views_.push_back(submaps_[0]->get_device_view());
+  submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
+
+  CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
+  d_submaps_erase_num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, max_num_submaps_);
+  CUCO_CUDA_TRY(cudaMallocHost(&h_submaps_erase_num_successes_, sizeof(atomic_ctr_type) * (max_num_submaps_)));
+}
+ 
+ template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
+ dynamic_map<Key, Value, Scope, Allocator>::~dynamic_map()
+ {
+   CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_));
+  std::allocator_traits<counter_allocator_type>::deallocate(counter_allocator_, d_submaps_erase_num_successes_ , max_num_submaps_);
+  CUCO_ASSERT_CUDA_SUCCESS(cudaFreeHost(reinterpret_cast<void *>(h_submaps_erase_num_successes_)));
+ }
+ 
+ template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
+@@ -75,6 +113,9 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
+ 
+     num_elements_remaining -= max_load_factor_ * submap_capacity - min_insert_size_;
+     submap_idx++;
+    if (submap_idx > max_num_submaps_) {
+      throw std::runtime_error("The number of submaps exceeds the maximum[256]");
+    }
+   }
+ }
+ 
+@@ -160,4 +201,55 @@ void dynamic_map<Key, Value, Scope, Allocator>::contains(
+   CUCO_CUDA_TRY(cudaDeviceSynchronize());
+ }
+ 
+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
+template <typename InputIt, typename Hash, typename KeyEqual>
+void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first, InputIt last,
+      cudaStream_t stream, Hash hash, KeyEqual key_equal) {
+  auto num_keys = std::distance(first, last);
+  if (num_keys == 0) { return; }
+
+  auto constexpr block_size = 128;
+  auto constexpr stride     = 1;
+  auto constexpr tile_size  = 4;
+  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
+
+  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
+  for(size_t i = 0; i < max_num_submaps_; i++) {
+    h_submaps_erase_num_successes_[i] = 0;
+  }
+
+  CUCO_CUDA_TRY(cudaMemcpyAsync(
+    d_submaps_erase_num_successes_, h_submaps_erase_num_successes_, submaps_.size() * sizeof(atomic_ctr_type),
+    cudaMemcpyHostToDevice, stream));
+
+  detail::erase<block_size, tile_size><<<grid_size, block_size, sizeof(atomic_ctr_type) * submaps_.size(), stream>>>(
+    first, first + num_keys, d_submaps_erase_num_successes_, submap_mutable_views_.data().get(), submaps_.size(), hash, key_equal);
+
+  CUCO_CUDA_TRY(cudaMemcpyAsync(
+    h_submaps_erase_num_successes_, d_submaps_erase_num_successes_, submaps_.size() * sizeof(atomic_ctr_type),
+    cudaMemcpyDeviceToHost, stream));
+
+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
+  for(size_t submap_idx = 0; submap_idx < submaps_.size(); submap_idx++){
+    submaps_[submap_idx]->size_ -= h_submaps_erase_num_successes_[submap_idx];
+    size_ -= h_submaps_erase_num_successes_[submap_idx];
+  }
+}
+
+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
+bool dynamic_map<Key, Value, Scope, Allocator>::get_keys_values(Key *keys, Value *values, cudaStream_t stream) {
+  *num_successes_ = 0;
+  int device_id;
+  CUCO_CUDA_TRY(cudaGetDevice(&device_id));
+  CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id));
+
+  auto const block_size = 128;
+  auto const stride     = 1;
+  auto const grid_size  = (size_ + stride * block_size - 1) / (stride * block_size);
+  detail::get_keys_values<<<grid_size, block_size, sizeof(atomic_ctr_type), stream>>>(submaps_.size(), submap_views_.data().get(), num_successes_, keys, values);
+
+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
+  size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
+  return h_num_successes == size_;
+}
+ }  // namespace cuco
+diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
+index f261b49..75b2c07 100644
+--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
+@@ -20,6 +20,7 @@
+ #include <cuda/std/atomic>
+ 
+ #include <cooperative_groups.h>
+#include <cuco/detail/bitwise_compare.cuh>
+ 
+ namespace cuco {
+ namespace detail {
+@@ -463,5 +464,87 @@ __global__ void contains(InputIt first,
+     key_idx += (gridDim.x * blockDim.x) / tile_size;
+   }
+ }
+
+template <std::size_t block_size,
+          uint32_t tile_size,
+          typename InputIt,
+          typename atomicT,
+          typename viewT,
+          typename Hash,
+          typename KeyEqual>
+__global__ void erase(
+  InputIt first, InputIt last, atomicT* num_successes, viewT* views, std::size_t num_submaps, Hash hash, KeyEqual key_equal)
+{
+  extern __shared__ atomicT local_num_successes[];
+
+  if (threadIdx.x < num_submaps) {
+    local_num_successes[threadIdx.x] = 0;
+  }
+  __syncthreads();
+
+  auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tid  = block_size * blockIdx.x + threadIdx.x;
+  auto it   = first + tid / tile_size;
+
+  while (it < last) {
+    for (auto submap_idx = 0; submap_idx < num_submaps; ++submap_idx) {
+      if (views[submap_idx].erase(tile, *it, hash, key_equal)) {
+        if (tile.thread_rank() == 0) {
+          local_num_successes[submap_idx] += 1;
+        }
+        break;
+      }
+    }
+    it += (gridDim.x * block_size) / tile_size;
+  }
+
+  __syncthreads();
+  if (threadIdx.x < num_submaps) {
+    num_successes[threadIdx.x] += local_num_successes[threadIdx.x];
+  }
+}
+
+template<typename Key, typename Value, typename ViewType, typename AtomicType>
+__global__ void get_keys_values(size_t num_submaps, ViewType *submap_views, AtomicType* global_cnt, Key* keys, Value*values) {
+  __shared__ size_t global_offset;
+  extern __shared__ AtomicType local_cnt[];
+  const int default_offset_sentinel = -1;
+
+  for (size_t submap_idx = 0; submap_idx < num_submaps; submap_idx++){
+    auto & submap_view = submap_views[submap_idx];
+
+    for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < submap_view.get_capacity();
+         tid += blockDim.x * gridDim.x) {
+      if (threadIdx.x == 0) {
+        local_cnt[0] = 0;
+      }
+      __syncthreads();
+
+      auto current_slot = submap_view.begin_slot() + tid;
+      const Key & current_key = current_slot->first.load(cuda::std::memory_order_relaxed);
+      auto const slot_not_idle =
+        !detail::bitwise_compare(current_key, submap_view.get_empty_key_sentinel()) &&
+        !detail::bitwise_compare(current_key, submap_view.get_erased_key_sentinel());
+
+      int local_offset = default_offset_sentinel;
+      if(slot_not_idle) {
+        local_offset = local_cnt[0].fetch_add(1, cuda::std::memory_order_relaxed);
+      }
+      __syncthreads();
+
+      if (threadIdx.x == 0) {
+        auto local_cnt_value = local_cnt[0].load(cuda::std::memory_order_relaxed);
+        global_offset = global_cnt->fetch_add(local_cnt_value, cuda::std::memory_order_relaxed);
+      }
+      __syncthreads();
+
+      if (local_offset > default_offset_sentinel) {
+        auto offset = global_offset + local_offset;
+          keys[offset] = current_key;
+          values[offset] = current_slot->second.load(cuda::std::memory_order_relaxed);
+      }
+    }
+  }
+}
+ }  // namespace detail
+ }  // namespace cuco
+diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
+index af3ea03..9ed2f25 100644
+--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
+@@ -105,6 +105,8 @@ class dynamic_map {
+   using atomic_ctr_type = cuda::atomic<std::size_t, Scope>;  ///< Type of atomic counters
+   using view_type = typename static_map<Key, Value, Scope, Allocator>::device_view;  ///< Device view type
+   using mutable_view_type = typename static_map<Key, Value, Scope, Allocator>::device_mutable_view;
+  using counter_allocator_type = typename static_map<Key, Value, Scope, Allocator>::counter_allocator_type;
+
+   ///< Device mutable view type
+ 
+   dynamic_map(dynamic_map const&) = delete;
+@@ -135,7 +137,36 @@ class dynamic_map {
+   dynamic_map(std::size_t initial_capacity,
+               sentinel::empty_key<Key> empty_key_sentinel,
+               sentinel::empty_value<Value> empty_value_sentinel,
+-              Allocator const& alloc = Allocator{});
+              Allocator const& alloc = Allocator{},
+              cudaStream_t stream = 0);
+
+  /**
+   * @brief Construct a dynamically-sized map with the specified initial capacity, growth factor and
+   * sentinel values.
+   *
+   * The capacity of the map will automatically increase as the user adds key/value pairs using
+   * `insert`.
+   *
+   * Capacity increases by a factor of growth_factor each time the size of the map exceeds a
+   * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the
+   * map's capacity grows.
+   *
+   * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and
+   * undefined behavior results from attempting to insert any key/value pair
+   * that contains either.
+   *
+   * @param initial_capacity The initial number of slots in the map
+   * @param empty_key_sentinel The reserved key value for empty slots
+   * @param empty_value_sentinel The reserved mapped value for empty slots
+   * @param erased_key_sentinel The reserved value to denote erased slots
+   * @param alloc Allocator used to allocate submap device storage
+   */
+dynamic_map(std::size_t initial_capacity,
+            sentinel::empty_key<Key> empty_key_sentinel,
+            sentinel::empty_value<Value> empty_value_sentinel,
+            sentinel::erased_key<Key> erased_key_sentinel,
+            Allocator const& alloc = Allocator(),
+            cudaStream_t stream = 0);
+ 
+   /**
+    * @brief Destroy the map and frees its contents
+@@ -227,6 +258,25 @@ class dynamic_map {
+                 Hash hash          = Hash{},
+                 KeyEqual key_equal = KeyEqual{});
+ 
+  template <typename InputIt,
+            typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
+            typename KeyEqual = thrust::equal_to<key_type>>
+  void erase(InputIt first,
+             InputIt last,
+             cudaStream_t stream = 0,
+             Hash hash           = Hash{},
+             KeyEqual key_equal  = KeyEqual{});
+
+  /**
+   * @brief Get all keys and values in the hash map.
+   *
+   * @param keys The output parameter, pointing the buffer which will maintain all keys in the hash map.
+   * @param values The output parameter, pointing the buffer which will maintain all values in the hash map.
+   * @param stream The cuda stream.
+   * @return Whether export keys and values successfully.
+   */
+  bool get_keys_values(Key *keys, Value *values, cudaStream_t stream = 0);
+
+   /**
+    * @brief Gets the current number of elements in the map
+    *
+@@ -307,6 +357,7 @@ class dynamic_map {
+  private:
+   key_type empty_key_sentinel_{};       ///< Key value that represents an empty slot
+   mapped_type empty_value_sentinel_{};  ///< Initial value of empty slot
+  key_type erased_key_sentinel_{};  ///< Key value that represents an erased slot
+   std::size_t size_{};                  ///< Number of keys in the map
+   std::size_t capacity_{};              ///< Maximum number of keys that can be inserted
+   float max_load_factor_{};             ///< Max load factor before capacity growth
+@@ -319,6 +370,11 @@ class dynamic_map {
+   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
+   atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
+   Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
+
+  counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate counters
+  atomic_ctr_type* d_submaps_erase_num_successes_; ///< number of successfully erased keys on erase, atomic on device.
+  atomic_ctr_type* h_submaps_erase_num_successes_; ///< number of successfully erased keys on erase, atomic on host.
+  const size_t max_num_submaps_ = 256;  ///< The max number of submaps.
+ };
+ }  // namespace cuco