!32414 RL add gpu kernel: batch read write

Merge pull request !32414 from VectorSL/read_write_op
This commit is contained in:
i-robot 2022-04-07 02:16:34 +00:00 committed by Gitee
commit 4adf5064fc
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
3 changed files with 176 additions and 0 deletions

View File

@ -0,0 +1,45 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_RL_BATCH_ASSIGN_GPU_BASE_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_RL_BATCH_ASSIGN_GPU_BASE_H_
#include <string>
#include <vector>
#include <shared_mutex>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// Common base for the GPU batch read/write ("batch assign") kernels. It owns
// the process-wide reader/writer mutex that derived kernels use to serialize
// one writer against any number of concurrent readers.
class BatchAssignBaseMod : public NativeGpuKernelMod {
public:
BatchAssignBaseMod() = default;
~BatchAssignBaseMod() override = default;
// Derived kernels parse the CNode's attributes and fill their size lists here.
virtual bool Init(const CNodePtr &kernel_node) = 0;
protected:
// Intentionally empty: derived classes populate input/output size lists
// directly in Init, so no extra sizing work is needed at this level.
void InitSizeLists() {}
// Using shared_mutex to achieve the following:
// the read-write lock can have either one writer or multiple readers at the
// same time, but never readers and a writer simultaneously. Shared by all
// instances (static), so every batch-assign kernel synchronizes on it.
static std::shared_mutex rw_mutex_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_RL_BATCH_ASSIGN_GPU_BASE_H_

View File

@ -0,0 +1,85 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <memory>
#include <mutex>
#include <shared_mutex>
#include "plugin/device/gpu/kernel/rl/batch_assign_gpu_kernel.h"
#include "kernel/common_utils.h"
namespace mindspore {
namespace kernel {
// The op carries two equally sized input lists (destinations then sources),
// so the total input count is divided by two.
constexpr size_t kHalf = 2;
// Init shared_mutex in base (single definition for the static member).
std::shared_mutex BatchAssignBaseMod::rw_mutex_;
// Start with no elements and reader (shared-lock) mode until Init reads attrs.
BatchAssignKernelMod::BatchAssignKernelMod() : elements_num_(0), lock_(false) {}
// Parses the kernel node: reads the "lock" attribute (writer vs. reader mode)
// and precomputes the byte size of every input element. The inputs come as two
// mirrored lists (destinations then sources) with identical shapes and types,
// so sizes are computed once for the first half and duplicated for the second.
// Returns true on success; throws if kernel_node is null.
bool BatchAssignKernelMod::Init(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  kernel_node_ = kernel_node;
  kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
  lock_ = GetAttr<bool>(kernel_node, "lock");
  const size_t total_inputs = common::AnfAlgo::GetInputNum(kernel_node);
  elements_num_ = total_inputs / kHalf;
  // Byte size of each element in the first list: type width times the product
  // of all shape dimensions.
  std::vector<size_t> element_sizes;
  element_sizes.reserve(elements_num_);
  for (size_t idx = 0; idx < elements_num_; idx++) {
    const auto dtype = AnfAlgo::GetInputDeviceDataType(kernel_node, idx);
    size_t bytes = GetTypeByte(TypeIdToType(dtype));
    const auto dims = AnfAlgo::GetInputDeviceShape(kernel_node, idx);
    for (const auto dim : dims) {
      bytes *= LongToSize(dim);
    }
    element_sizes.push_back(bytes);
  }
  // Both input lists share the same sizes, so append the computed sizes twice.
  input_size_list_.insert(input_size_list_.end(), element_sizes.begin(), element_sizes.end());
  input_size_list_.insert(input_size_list_.end(), element_sizes.begin(), element_sizes.end());
  // Single placeholder output.
  output_size_list_.push_back(sizeof(float));
  return true;
}
// Copies each source tensor over its matching destination tensor on the given
// CUDA stream, under the class-wide reader/writer mutex: writer mode (lock_)
// takes the exclusive lock, reader mode takes the shared lock.
// Returns true on success; throws on CUDA errors.
bool BatchAssignKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
                                  const std::vector<AddressPtr> &, void *stream) {
  auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
  // Using a shared lock for readers so there can be more than one reader at a time.
  // Using a unique lock for the writer to ensure there's only one writer at a time.
  // BUGFIX: the locks were previously constructed inside the if/else branch
  // scopes, so each lock was destroyed (and the mutex released) as soon as its
  // branch closed — the copy loop below then ran with no lock held. Construct
  // both locks deferred at function scope and lock the chosen one, so it is
  // held for the whole copy and released in its destructor on return.
  std::unique_lock<std::shared_mutex> write_lock(rw_mutex_, std::defer_lock);
  std::shared_lock<std::shared_mutex> read_lock(rw_mutex_, std::defer_lock);
  if (lock_) {
    write_lock.lock();
  } else {
    read_lock.lock();
  }
  // The first half of `inputs` are the weights to be updated, the last half
  // are the sources; overwrite each destination with its source.
  for (size_t i = 0; i < elements_num_; i++) {
    auto local_addr = GetDeviceAddress<int64_t>(inputs, i);
    auto source_addr = GetDeviceAddress<int64_t>(inputs, i + elements_num_);
    MS_ERROR_IF_NULL(local_addr);
    MS_ERROR_IF_NULL(source_addr);
    CHECK_CUDA_RET_WITH_EXCEPT(
      kernel_node_,
      cudaMemcpyAsync(local_addr, source_addr, input_size_list_[i], cudaMemcpyDeviceToDevice, cuda_stream),
      "Overwrite failed");
  }
  // Synchronize before the locks are released so no reader can observe a
  // half-written batch.
  CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(cuda_stream),
                             "BatchAssignKernel cudaStreamSynchronized failed");
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,46 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_RL_BATCH_ASSIGN_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_RL_BATCH_ASSIGN_GPU_KERNEL_H_
#include <string>
#include <vector>
#include "plugin/device/gpu/kernel/rl/batch_assign_gpu_base.h"
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// GPU kernel that batch-copies a list of source tensors over a matching list
// of destination tensors, optionally under the exclusive (writer) lock shared
// with the other batch-assign kernels via BatchAssignBaseMod::rw_mutex_.
class BatchAssignKernelMod : public BatchAssignBaseMod {
public:
BatchAssignKernelMod();
~BatchAssignKernelMod() = default;
// Copies each source input over its matching destination on stream_ptr.
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
// Reads the "lock" attribute and precomputes per-element byte sizes.
bool Init(const CNodePtr &kernel_node) override;
private:
size_t elements_num_;  // Element count of each input list (total inputs / 2).
bool lock_;  // True => Launch takes the exclusive (writer) lock.
};
MS_REG_GPU_KERNEL(BatchAssign, BatchAssignKernelMod)
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_RL_BATCH_ASSIGN_GPU_KERNEL_H_