diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_gpu_kernel.cc
index da87cba8e53..e66c6812fd8 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_gpu_kernel.cc
@@ -23,8 +23,6 @@ static std::unordered_map<std::string, int> op_type_map = {
   {"InplaceUpdate", INPLACE_OP_TYPE_UPDATE}, {"InplaceAdd", INPLACE_OP_TYPE_ADD}, {"InplaceSub", INPLACE_OP_TYPE_SUB}};
 bool InplaceOpGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
                                  const std::vector<KernelTensorPtr> &outputs) {
-  // auto kernel_ptr_ = std::dynamic_pointer_cast(base_operator);
-  // kernel_name_ = kernel_ptr_->name();
   kernel_name_ = base_operator->name();
   auto iter = op_type_map.find(kernel_name_);
   if (iter == op_type_map.end()) {
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_v2_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_v2_gpu_kernel.cc
new file mode 100644
index 00000000000..cf76ae5b67d
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_v2_gpu_kernel.cc
@@ -0,0 +1,157 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/arrays/inplace_op_v2_gpu_kernel.h"
+#include <algorithm>
+#include <numeric>
+#include <unordered_map>
+namespace mindspore {
+namespace kernel {
+static std::unordered_map<std::string, int> op_type_map = {{"InplaceUpdateV2", INPLACE_OP_TYPE_UPDATE}};
+bool InplaceOpV2GpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                   const std::vector<KernelTensorPtr> &outputs) {
+  kernel_name_ = base_operator->name();
+  auto iter = op_type_map.find(kernel_name_);
+  if (iter == op_type_map.end()) {
+    MS_LOG(ERROR) << "For InplaceOpV2 kernel, only InplaceUpdateV2 is supported, but got " << kernel_name_;
+    return false;
+  }
+  kernel_type_ = iter->second;
+  if (inputs.empty() || outputs.empty()) {
+    MS_LOG(ERROR) << "For '" << kernel_name_ << "', got empty inputs or outputs, which is invalid.";
+    return false;
+  }
+  auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
+  auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
+  if (!is_match) {
+    MS_LOG(ERROR) << "For '" << kernel_name_
+                  << "', the supported data types are [float16, float32, float64, int32], but got: " << kernel_attr;
+    return false;
+  }
+  kernel_func_ = func_list_[index].second;
+  unit_size_ = abstract::TypeIdSize(inputs[0]->GetDtype());
+  return true;
+}
+
+int InplaceOpV2GpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                    const std::vector<KernelTensorPtr> &outputs,
+                                    const std::map<uint32_t, tensor::TensorPtr> &) {
+  for (const auto &input : inputs) {
+    // If any input shape contains -1, the shape is still dynamic; skip sizing and report an unknown shape.
+    auto input_shape = input->GetShapeVector();
+    if (!IsValidShape(input_shape)) {
+      return KRET_UNKNOWN_SHAPE;
+    }
+  }
+  ResetResource();
+  std::vector<int64_t> input_shape_x = std::vector<int64_t>(inputs.at(kIndex0)->GetDeviceShapeAdaptively().begin(),
+                                                            inputs.at(kIndex0)->GetDeviceShapeAdaptively().end());
+  std::vector<int64_t> input_shape_indices = std::vector<int64_t>(
+    inputs.at(kIndex1)->GetDeviceShapeAdaptively().begin(), inputs.at(kIndex1)->GetDeviceShapeAdaptively().end());
+  std::vector<int64_t> input_shape_v = std::vector<int64_t>(inputs.at(kIndex2)->GetDeviceShapeAdaptively().begin(),
+                                                            inputs.at(kIndex2)->GetDeviceShapeAdaptively().end());
+  // All dimensions after the first form one contiguous "band" per row of 'x'.
+  band_size_ = 1;
+  for (size_t i = 1; i < input_shape_x.size(); ++i) {
+    band_size_ *= input_shape_x[i];
+  }
+  input_elements_x = std::accumulate(input_shape_x.begin(), input_shape_x.end(), 1, std::multiplies<int64_t>());
+  input_elements_v = std::accumulate(input_shape_v.begin(), input_shape_v.end(), 1, std::multiplies<int64_t>());
+  size_t input_size_x = input_elements_x * unit_size_;
+  size_t indices_size = input_shape_indices.size() * sizeof(int32_t);
+  size_t input_size_v = input_elements_v * unit_size_;
+  input_size_list_.push_back(input_size_x);
+  input_size_list_.push_back(indices_size);
+  input_size_list_.push_back(input_size_v);
+  output_size_list_.push_back(input_size_x);
+  if (kernel_name_ == ops::kNameInplaceUpdateV2) {
+    // Workspace holds the permutation keys used to deduplicate repeated indices.
+    workspace_size_list_.push_back(indices_size);
+  }
+  return KRET_OK;
+}
+
+void InplaceOpV2GpuKernelMod::ResetResource() noexcept {
+  band_size_ = 1;
+  input_elements_x = 0;
+  input_elements_v = 0;
+  is_null_input_ = false;
+  input_size_list_.clear();
+  output_size_list_.clear();
+  workspace_size_list_.clear();
+}
+
+template <typename T>
+bool InplaceOpV2GpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs,
+                                           const std::vector<AddressPtr> &workspace,
+                                           const std::vector<AddressPtr> &outputs) {
+  T *input_x = GetDeviceAddress<T>(inputs, kIndex0);
+  int32_t *input_indices = GetDeviceAddress<int32_t>(inputs, kIndex1);
+  T *input_v = GetDeviceAddress<T>(inputs, kIndex2);
+  T *output = GetDeviceAddress<T>(outputs, kIndex0);
+  int32_t *indices_key_ptr = nullptr;
+  if (kernel_name_ == ops::kNameInplaceUpdateV2) {
+    indices_key_ptr = GetDeviceAddress<int32_t>(workspace, kIndex0);
+  }
+  auto cuda_stream = reinterpret_cast<cudaStream_t>(cuda_stream_);
+
+  // Copy 'x' into the output buffer first; the in-place update is then applied to this copy.
+  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
+    cudaMemcpyAsync(output, input_x, input_elements_x * unit_size_, cudaMemcpyDeviceToDevice, cuda_stream),
+    "cudaMemcpyAsync from 'input_x' to 'output' failed.");
+  CalInplaceOp(input_elements_v, input_v, output, input_indices, indices_key_ptr, band_size_, device_id_, kernel_type_,
+               cuda_stream);
+  return true;
+}
+
+std::vector<std::pair<KernelAttr, InplaceOpV2GpuKernelMod::InplaceOpFunc>> InplaceOpV2GpuKernelMod::func_list_ = {
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeFloat16)
+     .AddOutputAttr(kNumberTypeFloat16),
+   &InplaceOpV2GpuKernelMod::LaunchKernel<half>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeFloat32)
+     .AddOutputAttr(kNumberTypeFloat32),
+   &InplaceOpV2GpuKernelMod::LaunchKernel<float>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeFloat64)
+     .AddOutputAttr(kNumberTypeFloat64),
+   &InplaceOpV2GpuKernelMod::LaunchKernel<double>},
+  {KernelAttr()
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddInputAttr(kNumberTypeInt32)
+     .AddOutputAttr(kNumberTypeInt32),
+   &InplaceOpV2GpuKernelMod::LaunchKernel<int>}};
+
+std::vector<KernelAttr> InplaceOpV2GpuKernelMod::GetOpSupport() {
+  std::vector<KernelAttr> support_list;
+  (void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
+                       [](const std::pair<KernelAttr, InplaceOpFunc> &pair) { return pair.first; });
+  return support_list;
+}
+
+MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, InplaceUpdateV2, InplaceOpV2GpuKernelMod);
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_v2_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_v2_gpu_kernel.h
new file mode 100644
index 00000000000..4f864ae7ca0
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/inplace_op_v2_gpu_kernel.h
@@ -0,0 +1,82 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_INPLACE_UPDATE_V2_GPU_KERNEL_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_INPLACE_UPDATE_V2_GPU_KERNEL_H_ +#include +#include +#include +#include +#include +#include +#include +#include "mindspore/core/ops/inplace_update_v2.h" +#include "abstract/utils.h" +#include "plugin/factory/ms_factory.h" +#include "plugin/device/gpu/kernel/gpu_kernel.h" +#include "plugin/device/gpu/kernel/gpu_kernel_factory.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/inplace_op_impl.cuh" + +namespace mindspore { +namespace kernel { +class InplaceOpV2GpuKernelMod : public NativeGpuKernelMod { + public: + InplaceOpV2GpuKernelMod() { ResetResource(); } + ~InplaceOpV2GpuKernelMod() override = default; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *cuda_stream) override { + if (is_null_input_) { + return true; + } + cuda_stream_ = cuda_stream; + return kernel_func_(this, inputs, workspace, outputs); + } + + bool Init(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs) override; + + int Resize(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs, const std::map &) override; + + std::vector GetOpSupport() override; + + void ResetResource() noexcept; + + private: + template + bool LaunchKernel(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs); + using InplaceOpFunc = + std::function &, + const std::vector &, const std::vector &)>; + + private: + std::vector indices_; + int kernel_type_{-1}; + size_t unit_size_{1}; + size_t input_elements_x; + size_t input_elements_v; + int64_t band_size_; + InplaceOpFunc kernel_func_{}; + bool is_null_input_{false}; + void *cuda_stream_{nullptr}; + static std::vector> func_list_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_INPLACE_UPDATE_V2_GPU_KERNEL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/inplace_op_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/inplace_op_impl.cu index 518a81b9373..c4cd92ee75e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/inplace_op_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/inplace_op_impl.cu @@ -32,14 +32,14 @@ struct AddFunc { __device__ __forceinline__ void operator()(T *lhs, const T &rhs) { MsAtomicAdd(lhs, rhs); } }; -template -__global__ void InplaceUpdate(const size_t size, const T *input_v, T *output, const int64_t *indices, - int64_t *indices_key, size_t indices_len, const int64_t band_size) { +template +__global__ void InplaceUpdate(const size_t size, const T *input_v, T *output, const S *indices, S *indices_key, + size_t indices_len, const int64_t band_size) { for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { size_t row = pos / band_size; if (row == indices_len || indices[row] != indices[row + 1]) { - int x_row = indices[row]; - int v_row = indices_key[row]; + S x_row = indices[row]; + S v_row = indices_key[row]; int offset = pos % band_size; int x_offset = x_row * band_size; output[x_offset + offset] = input_v[v_row * band_size + offset]; @@ -47,12 +47,12 @@ __global__ void InplaceUpdate(const size_t size, const T *input_v, T *output, co } return; } -template -__global__ void 
-__global__ void InplaceAddOrSub(const size_t size, const T *input_v, T *output, const int64_t *indices,
-                                const int64_t band_size) {
+template <typename T, typename S, typename Func>
+__global__ void InplaceAddOrSub(const size_t size, const T *input_v, T *output, const S *indices,
+                                const int64_t band_size) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
     int v_row = pos / band_size;
-    int x_row = indices[v_row];
+    S x_row = indices[v_row];
     int offset = pos % band_size;
     int x_offset = x_row * band_size;
     Func()(&output[x_offset + offset], input_v[pos]);
@@ -60,9 +60,9 @@ __global__ void InplaceAddOrSub(const size_t size, const T *input_v, T *output,
   return;
 }
 
-template <typename T>
-void CalInplaceOp(const size_t size_v, const T *input_v, T *output, int64_t *indices, int64_t *indices_key,
-                  const int64_t band_size, const uint32_t &device_id, int op_type, cudaStream_t cuda_stream) {
+template <typename T, typename S>
+void CalInplaceOp(const size_t size_v, const T *input_v, T *output, S *indices, S *indices_key, const int64_t band_size,
+                  const uint32_t &device_id, int op_type, cudaStream_t cuda_stream) {
   int thread_num = 256 > size_v ? size_v : 256;
   if (op_type == INPLACE_OP_TYPE_UPDATE) {
     auto policy = thrust::cuda::par.on(cuda_stream);
@@ -75,10 +75,10 @@ void CalInplaceOp(const size_t size_v, const T *input_v, T *output, int64_t *ind
     InplaceUpdate<<<CUDA_BLOCKS_CAL(size_v, thread_num), thread_num, 0, cuda_stream>>>(
       size_v, input_v, output, indices, indices_key, indices_element, band_size);
   } else if (op_type == INPLACE_OP_TYPE_ADD) {
-    InplaceAddOrSub<T, AddFunc<T>><<<CUDA_BLOCKS_CAL(size_v, thread_num), thread_num, 0, cuda_stream>>>(
+    InplaceAddOrSub<T, S, AddFunc<T>><<<CUDA_BLOCKS_CAL(size_v, thread_num), thread_num, 0, cuda_stream>>>(
       size_v, input_v, output, indices, band_size);
   } else if (op_type == INPLACE_OP_TYPE_SUB) {
-    InplaceAddOrSub<T, SubFunc<T>><<<CUDA_BLOCKS_CAL(size_v, thread_num), thread_num, 0, cuda_stream>>>(
+    InplaceAddOrSub<T, S, SubFunc<T>><<<CUDA_BLOCKS_CAL(size_v, thread_num), thread_num, 0, cuda_stream>>>(
       size_v, input_v, output, indices, band_size);
   }
   return;
@@ -99,3 +99,19 @@ template CUDA_LIB_EXPORT void CalInplaceOp<double>(const size_t size_v, const do
 template CUDA_LIB_EXPORT void CalInplaceOp<int>(const size_t size_v, const int *input_v, int *output,
                                                 int64_t *indices, int64_t *indices_key, const int64_t band_size,
                                                 const uint32_t &device_id, int op_type, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalInplaceOp<half, int32_t>(const size_t size_v, const half *input_v, half *output,
+                                                          int32_t *indices, int32_t *indices_key, const int64_t band_size,
+                                                          const uint32_t &device_id, int op_type, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalInplaceOp<float, int32_t>(const size_t size_v, const float *input_v, float *output,
+                                                           int32_t *indices, int32_t *indices_key, const int64_t band_size,
+                                                           const uint32_t &device_id, int op_type, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalInplaceOp<double, int32_t>(const size_t size_v, const double *input_v, double *output,
+                                                            int32_t *indices, int32_t *indices_key, const int64_t band_size,
+                                                            const uint32_t &device_id, int op_type, cudaStream_t cuda_stream);
+
+template CUDA_LIB_EXPORT void CalInplaceOp<int, int32_t>(const size_t size_v, const int *input_v, int *output,
+                                                         int32_t *indices, int32_t *indices_key, const int64_t band_size,
+                                                         const uint32_t &device_id, int op_type, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/inplace_op_impl.cuh b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/inplace_op_impl.cuh
index b0332015eaf..8f834e44022 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/inplace_op_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/inplace_op_impl.cuh
@@ -25,9 +25,9 @@ enum BroadcastOpType {
   INPLACE_OP_TYPE_SUB = 2,
 };
 
-template <typename T>
-CUDA_LIB_EXPORT void CalInplaceOp(const size_t size_v, const T *input_v, T *output, int64_t *indices,
-                                  int64_t *indices_key_ptr, const int64_t band_size, const uint32_t &device_id,
-                                  int op_type, cudaStream_t cuda_stream);
+template <typename T, typename S>
+CUDA_LIB_EXPORT void CalInplaceOp(const size_t size_v, const T *input_v, T *output, S *indices, S *indices_key_ptr,
+                                  const int64_t band_size, const uint32_t &device_id, int op_type,
+                                  cudaStream_t cuda_stream);
 
 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_INPLACE_UPDATE_IMPL_CUH_
diff --git a/mindspore/python/mindspore/ops/operations/__init__.py b/mindspore/python/mindspore/ops/operations/__init__.py
index f737df6ba94..3f437665016 100644
--- a/mindspore/python/mindspore/ops/operations/__init__.py
+++ b/mindspore/python/mindspore/ops/operations/__init__.py
@@ -67,8 +67,8 @@ from .inner_ops import (ScalarCast, Randperm, NoRepeatNGram, LambApplyOptimizerA
                         FusedAdaFactorWithGlobalNorm)
 from .linalg_ops import (Svd, Geqrf)
 from .math_ops import (Abs, ACos, Asin, Asinh, AddN, AccumulateNV2, AssignAdd, AssignSub, Atan2, BatchMatMul,
-                       BitwiseAnd, BitwiseOr, Ger,
-                       BitwiseXor, Inv, Invert, ApproximateEqual, InplaceAdd, InplaceSub, InplaceUpdate,
+                       BitwiseAnd, BitwiseOr, Ger, BitwiseXor, Inv, Invert, ApproximateEqual,
+                       InplaceAdd, InplaceSub, InplaceUpdate, InplaceUpdateV2,
                        ReduceMax, ReduceMin, ReduceMean, ReduceSum, ReduceAll, ReduceProd, CumProd, Cdist, ReduceAny,
                        Cos, Cross, Div, DivNoNan, Equal, EqualCount, Exp, Expm1, Erf, Erfc, Floor, FloorDiv, FloorMod,
                        Ceil, Acosh, Greater, GreaterEqual, Lerp, Less, LessEqual, Log, Log1p, LogicalAnd, Mod,
@@ -470,6 +470,7 @@ __all__ = [
     "DataFormatDimMap",
     "ApproximateEqual",
     "InplaceUpdate",
+    "InplaceUpdateV2",
     "InTopK",
     "UniformCandidateSampler",
     "LogUniformCandidateSampler",
diff --git a/mindspore/python/mindspore/ops/operations/math_ops.py b/mindspore/python/mindspore/ops/operations/math_ops.py
index a21430fd5e6..d7b9f68b245 100644
--- a/mindspore/python/mindspore/ops/operations/math_ops.py
+++ b/mindspore/python/mindspore/ops/operations/math_ops.py
@@ -1838,14 +1838,14 @@ class InplaceUpdateV2(Primitive):
         TypeError: If `indices` is a tuple and its element is not an int.
 
     Supported Platforms:
-        ``Ascend``
+        ``Ascend`` ``GPU``
 
     Examples:
         >>> indices = (0, 1)
         >>> x = Tensor(np.array([[1, 2], [3, 4], [5, 6]]), mindspore.float32)
         >>> v = Tensor(np.array([[0.5, 1.0], [1.0, 1.5]]), mindspore.float32)
-        >>> inplace_update = ops.InplaceUpdate(indices)
-        >>> output = inplace_update(x, v)
+        >>> inplace_update_v2 = ops.InplaceUpdateV2()
+        >>> output = inplace_update_v2(x, indices, v)
         >>> print(output)
         [[0.5 1. ]
          [1.  1.5]
diff --git a/tests/st/ops/gpu/test_inplace_update_v2_op.py b/tests/st/ops/gpu/test_inplace_update_v2_op.py
new file mode 100644
index 00000000000..f0eb07b5e18
--- /dev/null
+++ b/tests/st/ops/gpu/test_inplace_update_v2_op.py
@@ -0,0 +1,57 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import pytest
+
+import mindspore
+from mindspore import context
+from mindspore import Tensor
+from mindspore.ops import operations as P
+from mindspore import nn
+
+context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+
+
+class NetInplaceUpdateV2(nn.Cell):
+    def __init__(self, x, v):
+        super(NetInplaceUpdateV2, self).__init__()
+        self.x = x
+        self.v = v
+        self.inplace_update_v2 = P.InplaceUpdateV2()
+
+    def construct(self, indices):
+        output = self.inplace_update_v2(self.x, indices, self.v)
+        return output
+
+
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_inplace_update_fp16():
+    """
+    Feature: InplaceUpdateV2 GPU kernel
+    Description: test cases for InplaceUpdateV2 with dynamic-shape indices
+    Expectation: the result matches the expected result
+    """
+    x = Tensor([[1, 2], [3, 4], [5, 6]], mindspore.float16)
+    v = Tensor([[0.5, 1.0], [1.0, 1.5]], mindspore.float16)
+    inplace_update_v2 = NetInplaceUpdateV2(x, v)
+    indices = Tensor(shape=[None], dtype=mindspore.int32)
+    inplace_update_v2.set_inputs(indices)
+    real_indices = Tensor([0, 1], dtype=mindspore.int32)
+
+    output = inplace_update_v2(real_indices)
+    expect = Tensor([[0.5, 1.0], [1.0, 1.5], [5, 6]], mindspore.float16)
+    assert (output.asnumpy() == expect.asnumpy()).all()
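For quick manual verification outside the test harness, here is a minimal usage sketch (not part of the patch above). It assumes a MindSpore GPU build that includes this change, and uses PyNative mode so no `set_inputs` call is needed:

    import numpy as np
    import mindspore
    from mindspore import Tensor, context, ops

    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")

    x = Tensor(np.array([[1, 2], [3, 4], [5, 6]]), mindspore.float32)
    indices = Tensor(np.array([0, 1]), mindspore.int32)
    v = Tensor(np.array([[0.5, 1.0], [1.0, 1.5]]), mindspore.float32)

    # Unlike InplaceUpdate, which takes `indices` as a construction-time
    # attribute, InplaceUpdateV2 takes `indices` as a runtime input, which is
    # what allows the dynamic-shape indices exercised by the test above.
    inplace_update_v2 = ops.InplaceUpdateV2()
    output = inplace_update_v2(x, indices, v)
    print(output)
    # [[0.5 1. ]
    #  [1.  1.5]
    #  [5.  6. ]]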