SparseTensorToCSRSparseMatrix

Signed-off-by: izayoi <dingyahao@huawei.com>
izayoi 2022-11-18 10:27:45 +08:00
parent d28ce2e019
commit 4f73a617c0
2 changed files with 21 additions and 14 deletions


@@ -28,8 +28,8 @@ __global__ void SparseTensorToCSRSparseMatrixKernel(const IndiceType* x_indices_
                                                     int total_num,
                                                     int rank) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total_num; i += blockDim.x * gridDim.x) {
-    out_row_indices_ptr[i] = static_cast<IndiceType>(__ldg(x_indices_ptr + i * rank + rank - 2));
-    out_col_indices_ptr[i] = static_cast<IndiceType>(__ldg(x_indices_ptr + i * rank + rank - 1));
+    out_row_indices_ptr[i] = x_indices_ptr[i * rank + rank - 2];
+    out_col_indices_ptr[i] = x_indices_ptr[i * rank + rank - 1];
     if (rank == 3) {
       IndiceType batch = x_indices_ptr[i * rank];
       MsAtomicMax(out_batch_pointers_ptr + batch + 1, i + 1);
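For context, the kernel above reads COO indices laid out as an [nnz, rank] row-major array: the last two entries of each index tuple are (row, col), and for rank-3 input the leading entry is the batch id, whose end offset is recorded with an atomic max on batch_pointers[batch + 1]. The change itself only drops the __ldg read-only-cache intrinsic in favor of plain indexing; the loads are semantically unchanged. Below is a minimal self-contained sketch of the same access pattern, using CUDA's built-in atomicMax in place of MindSpore's MsAtomicMax and int indices throughout (all names here are illustrative, not the kernel's real ones):

#include <cstdio>
#include <cuda_runtime.h>

// Sketch of the access pattern: COO indices are [nnz, rank] row-major;
// rows/cols come from the last two columns, batch ids from column 0, and an
// atomic max records each batch's end offset.
__global__ void CooPiecesKernel(const int *indices, int *rows, int *cols, int *batch_ptrs, int total_num, int rank) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total_num; i += blockDim.x * gridDim.x) {
    rows[i] = indices[i * rank + rank - 2];
    cols[i] = indices[i * rank + rank - 1];
    if (rank == 3) {
      int batch = indices[i * rank];
      // batch_ptrs[b + 1] ends up one past the last nonzero of batch b
      // (entries are assumed sorted by batch, as in the real kernel).
      atomicMax(batch_ptrs + batch + 1, i + 1);
    }
  }
}

int main() {
  // Three nonzeros over two batches: (batch, row, col) tuples, batch-sorted.
  const int rank = 3, nnz = 3, batch_size = 2;
  const int h_indices[nnz * rank] = {0, 0, 1, 0, 2, 0, 1, 1, 1};
  int *d_indices, *d_rows, *d_cols, *d_ptrs;
  cudaMalloc(&d_indices, sizeof(h_indices));
  cudaMalloc(&d_rows, nnz * sizeof(int));
  cudaMalloc(&d_cols, nnz * sizeof(int));
  cudaMalloc(&d_ptrs, (batch_size + 1) * sizeof(int));
  cudaMemcpy(d_indices, h_indices, sizeof(h_indices), cudaMemcpyHostToDevice);
  cudaMemset(d_ptrs, 0, (batch_size + 1) * sizeof(int));
  CooPiecesKernel<<<1, 128>>>(d_indices, d_rows, d_cols, d_ptrs, nnz, rank);
  int h_ptrs[batch_size + 1];
  cudaMemcpy(h_ptrs, d_ptrs, sizeof(h_ptrs), cudaMemcpyDeviceToHost);
  printf("batch pointers: %d %d %d\n", h_ptrs[0], h_ptrs[1], h_ptrs[2]);  // 0 2 3
  cudaFree(d_indices); cudaFree(d_rows); cudaFree(d_cols); cudaFree(d_ptrs);
  return 0;
}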


@@ -46,6 +46,19 @@ bool SparseTensorToCSRSparseMatrixGpuKernelMod::Init(const BaseOperatorPtr &base
     MS_LOG(ERROR) << "For '" << kernel_name_ << "' got empty inputs, which is invalid.";
     return false;
   }
+  handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCuSparseHandle();
+  cusparseSetPointerMode(handle_, CUSPARSE_POINTER_MODE_HOST);
+  return true;
+}
+
+int SparseTensorToCSRSparseMatrixGpuKernelMod::Resize(const BaseOperatorPtr &base_operator,
+                                                      const std::vector<KernelTensorPtr> &inputs,
+                                                      const std::vector<KernelTensorPtr> &outputs,
+                                                      const std::map<uint32_t, tensor::TensorPtr> &) {
+  input_size_list_.clear();
+  output_size_list_.clear();
+  workspace_size_list_.clear();
+  auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
   for (size_t i = 0; i < inputs.size(); i++) {
     std::vector<int64_t> input_shape = std::vector<int64_t>(inputs.at(i)->GetDeviceShapeAdaptively().begin(),
                                                             inputs.at(i)->GetDeviceShapeAdaptively().end());
@@ -69,15 +82,6 @@ bool SparseTensorToCSRSparseMatrixGpuKernelMod::Init(const BaseOperatorPtr &base
       y_batch_pointers_ptr_test.resize(output_elements_);
     }
   }
-  handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCuSparseHandle();
-  cusparseSetPointerMode(handle_, CUSPARSE_POINTER_MODE_HOST);
-  return true;
-}
-
-int SparseTensorToCSRSparseMatrixGpuKernelMod::Resize(const BaseOperatorPtr &base_operator,
-                                                      const std::vector<KernelTensorPtr> &inputs,
-                                                      const std::vector<KernelTensorPtr> &outputs,
-                                                      const std::map<uint32_t, tensor::TensorPtr> &) {
   return KRET_OK;
 }
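This hunk is the other half of the previous one: the handle setup and the Resize signature are deleted here because they moved up into Init and the head of the new Resize body. The net effect is a conventional split, one-time work in Init (acquiring and configuring the cuSPARSE handle), shape-dependent bookkeeping redone in Resize on every shape change. A rough sketch of that lifecycle as a hypothetical standalone class (NOT the MindSpore kernel-mod API; the real signatures are the ones shown in the diff):

#include <cstddef>
#include <cstdint>
#include <vector>

class SketchKernelMod {
 public:
  bool Init() {
    // One-time setup: resources independent of input shapes,
    // e.g. the cuSPARSE handle in the real kernel.
    initialized_ = true;
    return true;
  }
  int Resize(const std::vector<std::vector<int64_t>> &input_shapes) {
    // Redone on every shape change: clear and rebuild the size lists.
    input_size_list_.clear();
    for (const auto &shape : input_shapes) {
      size_t bytes = sizeof(int32_t);
      for (int64_t dim : shape) bytes *= static_cast<size_t>(dim);
      input_size_list_.push_back(bytes);
    }
    return 0;  // the real code returns KRET_OK
  }

 private:
  bool initialized_ = false;
  std::vector<size_t> input_size_list_;
};

int main() {
  SketchKernelMod mod;
  mod.Init();                 // once
  mod.Resize({{4, 3}, {4}});  // on each shape change
  return 0;
}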
@@ -97,9 +101,6 @@ bool SparseTensorToCSRSparseMatrixGpuKernelMod::LaunchKernel(const std::vector<A
   cudaMemcpyAsync(x_dense_shape_ptr_test.data(), x_dense_shape_ptr, elements[kTwo] * sizeof(IndiceType),
                   cudaMemcpyDeviceToHost, stream);
-  cudaMemcpyAsync(y_value_ptr, x_value_ptr, elements[kOne] * sizeof(DataType), cudaMemcpyDeviceToDevice, stream);
-  cudaMemcpyAsync(y_dense_shape_ptr, x_dense_shape_ptr, elements[kTwo] * sizeof(DataType), cudaMemcpyDeviceToDevice,
-                  stream);
   SparseTensorToCSRSparseMatrix<IndiceType>(x_indices_ptr, x_row_indices_ptr, y_col_indices_ptr, y_batch_pointers_ptr,
                                             elements[kOne], elements[kTwo], stream, device_id_);
   if (elements[kTwo] == kRankWithoutBatch) {
@@ -111,6 +112,7 @@ bool SparseTensorToCSRSparseMatrixGpuKernelMod::LaunchKernel(const std::vector<A
     cudaMemcpyAsync(y_batch_pointers_ptr_test.data(), y_batch_pointers_ptr, (batch_size + 1) * sizeof(IndiceType),
                     cudaMemcpyDeviceToHost, stream);
     for (int i = 0; i < batch_size; ++i) {
+      y_batch_pointers_ptr_test[i + 1] = std::max(y_batch_pointers_ptr_test[i + 1], y_batch_pointers_ptr_test[i]);
       int *temp_row_indices_addr = x_row_indices_ptr + y_batch_pointers_ptr_test[i];
       int *temp_row_pointers_addr = y_row_pointers_ptr + i * (row_num + 1);
       temp_nnz = y_batch_pointers_ptr_test[i + 1] - y_batch_pointers_ptr_test[i];
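The one added line is the functional fix in this commit. The CUDA kernel only writes batch_pointers[b + 1] for batches that actually contain nonzeros, so an empty batch leaves a stale zero in its slot; taking a running max while walking the host copy makes the pointer array non-decreasing before it is used to slice x_row_indices_ptr per batch. A small worked demonstration of the fixup (plain C++, illustrative values):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Raw batch pointers from the atomic-max kernel for 3 batches with nnz
  // counts {2, 0, 3}: batch 1 is empty, so its slot is still 0.
  std::vector<int> ptrs = {0, 2, 0, 5};
  for (size_t i = 0; i + 1 < ptrs.size(); ++i) {
    // Same fix as the added line above: enforce non-decreasing pointers.
    ptrs[i + 1] = std::max(ptrs[i + 1], ptrs[i]);
  }
  for (int p : ptrs) printf("%d ", p);  // prints: 0 2 2 5
  printf("\n");
  return 0;
}

After the fixup, {0, 2, 0, 5} becomes {0, 2, 2, 5}, so the empty batch 1 correctly gets the empty slice [2, 2) and temp_nnz comes out as 0 instead of negative.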
@@ -119,7 +121,12 @@ bool SparseTensorToCSRSparseMatrixGpuKernelMod::LaunchKernel(const std::vector<A
                          CUSPARSE_INDEX_BASE_ZERO);
       }
     }
+    cudaMemcpyAsync(y_batch_pointers_ptr, y_batch_pointers_ptr_test.data(), (batch_size + 1) * sizeof(IndiceType),
+                    cudaMemcpyHostToDevice, stream);
   }
+  cudaMemcpyAsync(y_dense_shape_ptr, x_dense_shape_ptr, elements[kTwo] * sizeof(DataType), cudaMemcpyDeviceToDevice,
+                  stream);
+  cudaMemcpyAsync(y_value_ptr, x_value_ptr, elements[kOne] * sizeof(DataType), cudaMemcpyDeviceToDevice, stream);
   return true;
 }
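The truncated call ending in CUSPARSE_INDEX_BASE_ZERO is presumably cuSPARSE's COO-to-CSR conversion, cusparseXcoo2csr, run once per batch to turn temp_nnz sorted row indices into row_num + 1 CSR row pointers. The added lines also move the device-to-device copies of the values and the dense shape to the end of LaunchKernel and push the fixed-up batch pointers back to the device; since every transfer is issued on the same stream, the observable result is unchanged, but the copies no longer sit between the index kernel and the batch-pointer fixup. A minimal standalone sketch of the conversion routine itself (cusparseXcoo2csr is the documented cuSPARSE API; buffers and values here are illustrative):

#include <cstdio>
#include <cuda_runtime.h>
#include <cusparse.h>

int main() {
  // 4 nonzeros in a 3-row matrix, COO row indices sorted ascending.
  const int nnz = 4, m = 3;
  const int h_coo_rows[nnz] = {0, 0, 1, 2};
  int *d_coo_rows, *d_csr_ptrs;
  cudaMalloc(&d_coo_rows, nnz * sizeof(int));
  cudaMalloc(&d_csr_ptrs, (m + 1) * sizeof(int));
  cudaMemcpy(d_coo_rows, h_coo_rows, nnz * sizeof(int), cudaMemcpyHostToDevice);

  cusparseHandle_t handle;
  cusparseCreate(&handle);
  // COO row indices -> CSR row pointers, zero-based, as in the batch loop.
  cusparseXcoo2csr(handle, d_coo_rows, nnz, m, d_csr_ptrs, CUSPARSE_INDEX_BASE_ZERO);

  int h_csr_ptrs[m + 1];
  cudaMemcpy(h_csr_ptrs, d_csr_ptrs, sizeof(h_csr_ptrs), cudaMemcpyDeviceToHost);
  printf("csr row pointers: %d %d %d %d\n", h_csr_ptrs[0], h_csr_ptrs[1], h_csr_ptrs[2], h_csr_ptrs[3]);  // 0 2 3 4
  cusparseDestroy(handle);
  cudaFree(d_coo_rows);
  cudaFree(d_csr_ptrs);
  return 0;
}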