!48588 fix magicnum in quant cuda kernel

Merge pull request !48588 from hangq/master
This commit is contained in:
i-robot 2023-02-09 06:41:03 +00:00 committed by Gitee
commit 364e292587
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
17 changed files with 257 additions and 196 deletions

View File

@ -21,6 +21,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -45,15 +46,15 @@ class BatchNormFold2GpuKernelMod : public DeprecatedNativeGpuKernelMod {
return true;
}
auto *input = GetDeviceAddress<T>(inputs, 0);
auto *beta = GetDeviceAddress<T>(inputs, 1);
auto *gamma = GetDeviceAddress<T>(inputs, 2);
auto *batch_std = GetDeviceAddress<T>(inputs, 3);
auto *batch_mean = GetDeviceAddress<T>(inputs, 4);
auto *running_std = GetDeviceAddress<T>(inputs, 5);
auto *running_mean = GetDeviceAddress<T>(inputs, 6);
auto *global_step = GetDeviceAddress<int32_t>(inputs, 7);
auto *output = GetDeviceAddress<T>(outputs, 0);
auto *input = GetDeviceAddress<T>(inputs, kIndex0);
auto *beta = GetDeviceAddress<T>(inputs, kIndex1);
auto *gamma = GetDeviceAddress<T>(inputs, kIndex2);
auto *batch_std = GetDeviceAddress<T>(inputs, kIndex3);
auto *batch_mean = GetDeviceAddress<T>(inputs, kIndex4);
auto *running_std = GetDeviceAddress<T>(inputs, kIndex5);
auto *running_mean = GetDeviceAddress<T>(inputs, kIndex6);
auto *global_step = GetDeviceAddress<int32_t>(inputs, kIndex7);
auto *output = GetDeviceAddress<T>(outputs, kIndex0);
BatchNormFold2Forward(input, beta, gamma, batch_std, batch_mean, running_std, running_mean, global_step, output,
freeze_bn_, batch_size_, channel_, height_, width_,
@ -83,14 +84,14 @@ class BatchNormFold2GpuKernelMod : public DeprecatedNativeGpuKernelMod {
return true;
}
if (input_shape.size() != 4) {
if (input_shape.size() != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
<< input_shape.size();
}
batch_size_ = input_shape[0];
channel_ = input_shape[1];
height_ = input_shape[2];
width_ = input_shape[3];
batch_size_ = input_shape[kIndex0];
channel_ = input_shape[kIndex1];
height_ = input_shape[kIndex2];
width_ = input_shape[kIndex3];
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
freeze_bn_ = static_cast<int>(GetValue<int64_t>(prim->GetAttr("freeze_bn")));

View File

@ -21,6 +21,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -45,23 +46,23 @@ class BatchNormFold2GradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
return true;
}
auto *dout = GetDeviceAddress<T>(inputs, 0);
auto *x = GetDeviceAddress<T>(inputs, 1);
auto *gamma = GetDeviceAddress<T>(inputs, 2);
auto *batch_std = GetDeviceAddress<T>(inputs, 3);
auto *batch_mean = GetDeviceAddress<T>(inputs, 4);
auto *running_std = GetDeviceAddress<T>(inputs, 5);
auto *running_mean = GetDeviceAddress<T>(inputs, 6);
auto *global_step = GetDeviceAddress<int32_t>(inputs, 7);
auto *d_batch_std = GetDeviceAddress<T>(outputs, 0);
auto *d_batch_mean = GetDeviceAddress<T>(outputs, 1);
auto *d_beta = GetDeviceAddress<T>(outputs, 2);
auto *d_gamma = GetDeviceAddress<T>(outputs, 3);
auto *d_x = GetDeviceAddress<T>(outputs, 4);
auto *tmp = GetDeviceAddress<T>(workspace, 0);
auto *tmp2 = GetDeviceAddress<T>(workspace, 1);
auto *reduce_x = GetDeviceAddress<T>(workspace, 2);
auto *tmp_x = GetDeviceAddress<T>(workspace, 3);
auto *dout = GetDeviceAddress<T>(inputs, kIndex0);
auto *x = GetDeviceAddress<T>(inputs, kIndex1);
auto *gamma = GetDeviceAddress<T>(inputs, kIndex2);
auto *batch_std = GetDeviceAddress<T>(inputs, kIndex3);
auto *batch_mean = GetDeviceAddress<T>(inputs, kIndex4);
auto *running_std = GetDeviceAddress<T>(inputs, kIndex5);
auto *running_mean = GetDeviceAddress<T>(inputs, kIndex6);
auto *global_step = GetDeviceAddress<int32_t>(inputs, kIndex7);
auto *d_batch_std = GetDeviceAddress<T>(outputs, kIndex0);
auto *d_batch_mean = GetDeviceAddress<T>(outputs, kIndex1);
auto *d_beta = GetDeviceAddress<T>(outputs, kIndex2);
auto *d_gamma = GetDeviceAddress<T>(outputs, kIndex3);
auto *d_x = GetDeviceAddress<T>(outputs, kIndex4);
auto *tmp = GetDeviceAddress<T>(workspace, kIndex0);
auto *tmp2 = GetDeviceAddress<T>(workspace, kIndex1);
auto *reduce_x = GetDeviceAddress<T>(workspace, kIndex2);
auto *tmp_x = GetDeviceAddress<T>(workspace, kIndex3);
int32_t current_step_host[1];
size_t x_size = batch_size_ * channel_ * height_ * width_ * sizeof(T);
@ -110,14 +111,14 @@ class BatchNormFold2GradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
return true;
}
if (input_shape.size() != 4) {
if (input_shape.size() != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
<< input_shape.size();
}
batch_size_ = input_shape[0];
channel_ = input_shape[1];
height_ = input_shape[2];
width_ = input_shape[3];
batch_size_ = input_shape[kIndex0];
channel_ = input_shape[kIndex1];
height_ = input_shape[kIndex2];
width_ = input_shape[kIndex3];
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
freeze_bn_ = GetValue<int64_t>(prim->GetAttr("freeze_bn"));

View File

@ -22,6 +22,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/kernel_constants.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -53,21 +54,21 @@ class BatchNormFoldGpuKernelMod : public DeprecatedNativeGpuKernelMod {
return true;
}
(void)workspace;
auto x = GetDeviceAddress<T>(inputs, 0);
auto mean = GetDeviceAddress<T>(inputs, 1);
auto variance = GetDeviceAddress<T>(inputs, 2);
int *current_step = GetDeviceAddress<int>(inputs, 3);
auto x = GetDeviceAddress<T>(inputs, kIndex0);
auto mean = GetDeviceAddress<T>(inputs, kIndex1);
auto variance = GetDeviceAddress<T>(inputs, kIndex2);
int *current_step = GetDeviceAddress<int>(inputs, kIndex3);
int current_step_host[1];
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memoy failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed");
auto batch_mean = GetDeviceAddress<T>(outputs, 0);
auto batch_std = GetDeviceAddress<T>(outputs, 1);
auto running_mean = GetDeviceAddress<T>(outputs, 2);
auto running_std = GetDeviceAddress<T>(outputs, 3);
auto y = GetDeviceAddress<T>(workspace, 0);
auto batch_mean = GetDeviceAddress<T>(outputs, kIndex0);
auto batch_std = GetDeviceAddress<T>(outputs, kIndex1);
auto running_mean = GetDeviceAddress<T>(outputs, kIndex2);
auto running_std = GetDeviceAddress<T>(outputs, kIndex3);
auto y = GetDeviceAddress<T>(workspace, kIndex0);
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(running_mean, mean, output_size_, cudaMemcpyDeviceToDevice,
@ -99,12 +100,12 @@ class BatchNormFoldGpuKernelMod : public DeprecatedNativeGpuKernelMod {
kernel_node_ = kernel_node;
InitResource();
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
if (input_num != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 4) {
if (output_num != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 4, but got " << output_num;
}
@ -122,15 +123,15 @@ class BatchNormFoldGpuKernelMod : public DeprecatedNativeGpuKernelMod {
InitSizeLists();
return true;
}
if (input_shape.size() != 4) {
if (input_shape.size() != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
<< input_shape.size();
}
CheckTensorSize({input_shape});
batch_ = LongToInt(input_shape[0]);
channel_ = LongToInt(input_shape[1]);
height_ = LongToInt(input_shape[2]);
width_ = LongToInt(input_shape[3]);
batch_ = LongToInt(input_shape[kIndex0]);
channel_ = LongToInt(input_shape[kIndex1]);
height_ = LongToInt(input_shape[kIndex2]);
width_ = LongToInt(input_shape[kIndex3]);
input_size_ = sizeof(T) * batch_ * channel_ * height_ * width_;
output_size_ = sizeof(T) * channel_;

View File

@ -21,6 +21,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -50,19 +51,19 @@ class BatchNormFoldGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
return true;
}
// 'd_batch_mean', 'd_batch_std', 'x', 'batch_mean', 'batch_std', 'current_step'
T *d_batch_mean = GetDeviceAddress<T>(inputs, 0);
T *d_batch_std = GetDeviceAddress<T>(inputs, 1);
T *x = GetDeviceAddress<T>(inputs, 2);
T *batch_mean = GetDeviceAddress<T>(inputs, 3);
T *batch_std = GetDeviceAddress<T>(inputs, 4);
int *current_step = GetDeviceAddress<int>(inputs, 5);
T *d_batch_mean = GetDeviceAddress<T>(inputs, kIndex0);
T *d_batch_std = GetDeviceAddress<T>(inputs, kIndex1);
T *x = GetDeviceAddress<T>(inputs, kIndex2);
T *batch_mean = GetDeviceAddress<T>(inputs, kIndex3);
T *batch_std = GetDeviceAddress<T>(inputs, kIndex4);
int *current_step = GetDeviceAddress<int>(inputs, kIndex5);
int current_step_host[1];
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memoy failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed");
T *dx = GetDeviceAddress<T>(outputs, 0);
T *dx = GetDeviceAddress<T>(outputs, kIndex0);
if (!is_training_ || current_step_host[0] >= freeze_bn_) {
ThrustFillWith(dx, batch_ * channel_ * height_ * width_, 0.f, reinterpret_cast<cudaStream_t>(stream_ptr));
@ -99,14 +100,14 @@ class BatchNormFoldGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
InitSizeLists();
return true;
}
if (input_shape.size() != 4) {
if (input_shape.size() != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
<< input_shape.size();
}
batch_ = input_shape[0];
channel_ = input_shape[1];
height_ = input_shape[2];
width_ = input_shape[3];
batch_ = input_shape[kIndex0];
channel_ = input_shape[kIndex1];
height_ = input_shape[kIndex2];
width_ = input_shape[kIndex3];
input_size_ = sizeof(T) * batch_ * channel_ * height_ * width_;
channel_size_ = sizeof(T) * channel_;

View File

@ -21,6 +21,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -35,10 +36,10 @@ class CorrectionMulGpuKernelMod : public DeprecatedNativeGpuKernelMod {
if (is_null_input_) {
return true;
}
auto *weight = GetDeviceAddress<T>(inputs, 0);
auto *gamma = GetDeviceAddress<T>(inputs, 1);
auto *running_std = GetDeviceAddress<T>(inputs, 2);
auto *output = GetDeviceAddress<T>(outputs, 0);
auto *weight = GetDeviceAddress<T>(inputs, kIndex0);
auto *gamma = GetDeviceAddress<T>(inputs, kIndex1);
auto *running_std = GetDeviceAddress<T>(inputs, kIndex2);
auto *output = GetDeviceAddress<T>(outputs, kIndex0);
CalCorrectionMul(weight, gamma, running_std, batch_size_, channel_, height_, width_, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
@ -50,7 +51,7 @@ class CorrectionMulGpuKernelMod : public DeprecatedNativeGpuKernelMod {
InitResource();
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
if (input_num != kSize3) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
}
@ -64,14 +65,14 @@ class CorrectionMulGpuKernelMod : public DeprecatedNativeGpuKernelMod {
InitSizeLists();
return true;
}
if (input_shape.size() != 4) {
if (input_shape.size() != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
<< input_shape.size();
}
batch_size_ = input_shape[0];
channel_ = input_shape[1];
height_ = input_shape[2];
width_ = input_shape[3];
batch_size_ = input_shape[kIndex0];
channel_ = input_shape[kIndex1];
height_ = input_shape[kIndex2];
width_ = input_shape[kIndex3];
InitSizeLists();
return true;

View File

@ -21,6 +21,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -35,13 +36,13 @@ class CorrectionMulGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
if (is_null_input_) {
return true;
}
auto *d_out = GetDeviceAddress<T>(inputs, 0);
auto *weight = GetDeviceAddress<T>(inputs, 1);
auto *gamma = GetDeviceAddress<T>(inputs, 2);
auto *running_std = GetDeviceAddress<T>(inputs, 3);
auto *d_weight = GetDeviceAddress<T>(outputs, 0);
auto *d_gamma = GetDeviceAddress<T>(outputs, 1);
auto *tmp = GetDeviceAddress<T>(workspace, 0);
auto *d_out = GetDeviceAddress<T>(inputs, kIndex0);
auto *weight = GetDeviceAddress<T>(inputs, kIndex1);
auto *gamma = GetDeviceAddress<T>(inputs, kIndex2);
auto *running_std = GetDeviceAddress<T>(inputs, kIndex3);
auto *d_weight = GetDeviceAddress<T>(outputs, kIndex0);
auto *d_gamma = GetDeviceAddress<T>(outputs, kIndex1);
auto *tmp = GetDeviceAddress<T>(workspace, kIndex0);
CalCorrectionMul(d_out, gamma, running_std, batch_size_, channel_, height_, width_, d_weight,
reinterpret_cast<cudaStream_t>(stream_ptr));
@ -56,11 +57,11 @@ class CorrectionMulGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
InitResource();
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
if (input_num != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
}
auto shape_signed = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto shape_signed = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
if (IsDynamic(shape_signed)) {
return true;
}
@ -70,14 +71,14 @@ class CorrectionMulGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
InitSizeLists();
return true;
}
if (input_shape.size() != 4) {
if (input_shape.size() != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
<< input_shape.size();
}
batch_size_ = input_shape[0];
channel_ = input_shape[1];
height_ = input_shape[2];
width_ = input_shape[3];
batch_size_ = input_shape[kIndex0];
channel_ = input_shape[kIndex1];
height_ = input_shape[kIndex2];
width_ = input_shape[kIndex3];
InitSizeLists();
return true;

View File

@ -15,11 +15,12 @@
*/
#include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -36,12 +37,12 @@ bool FakeLearnedScaleQuantPerChannelGpuKernelMod::Init(const CNodePtr &kernel_no
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
if (input_num != kSize3) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
if (output_num != kSize1) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
}
@ -51,7 +52,7 @@ bool FakeLearnedScaleQuantPerChannelGpuKernelMod::Init(const CNodePtr &kernel_no
neg_trunc_ = GetValue<bool>(common::AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
// init size
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
num_channels_ = LongToInt(input_shape[0]);
for (size_t i = 0; i < input_shape.size(); ++i) {
quant_num_ *= LongToInt(input_shape[i]);
@ -73,12 +74,12 @@ void FakeLearnedScaleQuantPerChannelGpuKernelMod::InitSizeLists() {
bool FakeLearnedScaleQuantPerChannelGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_alpha = GetDeviceAddress<float>(inputs, 1);
float *input_quant_max = GetDeviceAddress<float>(inputs, 2);
float *output = GetDeviceAddress<float>(outputs, 0);
float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
float *input_quant = GetDeviceAddress<float>(workspace, 1);
float *input = GetDeviceAddress<float>(inputs, kIndex0);
float *input_alpha = GetDeviceAddress<float>(inputs, kIndex1);
float *input_quant_max = GetDeviceAddress<float>(inputs, kIndex2);
float *output = GetDeviceAddress<float>(outputs, kIndex0);
float *input_div_alpha = GetDeviceAddress<float>(workspace, kIndex0);
float *input_quant = GetDeviceAddress<float>(workspace, kIndex1);
MS_EXCEPTION_IF_NULL(input);
MS_EXCEPTION_IF_NULL(input_alpha);

View File

@ -15,6 +15,7 @@
*/
#include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -31,12 +32,12 @@ bool FakeLearnedScaleQuantPerChannelGradGpuKernelMod::Init(const CNodePtr &kerne
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
if (input_num != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
if (output_num != kSize2) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 2, but got " << output_num;
}
@ -50,7 +51,7 @@ bool FakeLearnedScaleQuantPerChannelGradGpuKernelMod::Init(const CNodePtr &kerne
neg_trunc_ = GetValue<bool>(common::AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
// init size
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
num_channels_ = LongToInt(input_shape[0]);
for (size_t i = 0; i < input_shape.size(); ++i) {
quant_num_ *= LongToInt(input_shape[i]);
@ -74,14 +75,14 @@ void FakeLearnedScaleQuantPerChannelGradGpuKernelMod::InitSizeLists() {
bool FakeLearnedScaleQuantPerChannelGradGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *grad_input = GetDeviceAddress<float>(outputs, 0);
float *grad_alpha = GetDeviceAddress<float>(outputs, 1);
float *gradient = GetDeviceAddress<float>(inputs, 0);
float *input = GetDeviceAddress<float>(inputs, 1);
float *input_alpha = GetDeviceAddress<float>(inputs, 2);
float *input_quant_max = GetDeviceAddress<float>(inputs, 3);
float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
float *input_quant = GetDeviceAddress<float>(workspace, 1);
float *grad_input = GetDeviceAddress<float>(outputs, kIndex0);
float *grad_alpha = GetDeviceAddress<float>(outputs, kIndex1);
float *gradient = GetDeviceAddress<float>(inputs, kIndex0);
float *input = GetDeviceAddress<float>(inputs, kIndex1);
float *input_alpha = GetDeviceAddress<float>(inputs, kIndex2);
float *input_quant_max = GetDeviceAddress<float>(inputs, kIndex3);
float *input_div_alpha = GetDeviceAddress<float>(workspace, kIndex0);
float *input_quant = GetDeviceAddress<float>(workspace, kIndex1);
MS_EXCEPTION_IF_NULL(grad_input);
MS_EXCEPTION_IF_NULL(grad_alpha);

View File

@ -15,11 +15,12 @@
*/
#include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -30,12 +31,12 @@ bool FakeLearnedScaleQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
if (input_num != kSize3) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
if (output_num != kSize1) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
}
@ -45,7 +46,7 @@ bool FakeLearnedScaleQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node
neg_trunc_ = GetValue<bool>(common::AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
// init size
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
for (size_t i = 0; i < input_shape.size(); ++i) {
quant_num_ *= LongToInt(input_shape[i]);
}
@ -66,12 +67,12 @@ void FakeLearnedScaleQuantPerLayerGpuKernelMod::InitSizeLists() {
bool FakeLearnedScaleQuantPerLayerGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_alpha = GetDeviceAddress<float>(inputs, 1);
float *input_quant_max = GetDeviceAddress<float>(inputs, 2);
float *output = GetDeviceAddress<float>(outputs, 0);
float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
float *input_quant = GetDeviceAddress<float>(workspace, 1);
float *input = GetDeviceAddress<float>(inputs, kIndex0);
float *input_alpha = GetDeviceAddress<float>(inputs, kIndex1);
float *input_quant_max = GetDeviceAddress<float>(inputs, kIndex2);
float *output = GetDeviceAddress<float>(outputs, kIndex0);
float *input_div_alpha = GetDeviceAddress<float>(workspace, kIndex0);
float *input_quant = GetDeviceAddress<float>(workspace, kIndex1);
MS_EXCEPTION_IF_NULL(input);
MS_EXCEPTION_IF_NULL(input_alpha);

View File

@ -15,6 +15,7 @@
*/
#include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -25,12 +26,12 @@ bool FakeLearnedScaleQuantPerLayerGradGpuKernelMod::Init(const CNodePtr &kernel_
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
if (input_num != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
if (output_num != kSize2) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 2, but got " << output_num;
}
@ -44,7 +45,7 @@ bool FakeLearnedScaleQuantPerLayerGradGpuKernelMod::Init(const CNodePtr &kernel_
neg_trunc_ = GetValue<bool>(common::AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
// init size
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
auto size = SizeOf(input_shape);
quant_num_ = SizeToInt(size);
input_size_ = sizeof(float) * size;
@ -66,14 +67,14 @@ void FakeLearnedScaleQuantPerLayerGradGpuKernelMod::InitSizeLists() {
bool FakeLearnedScaleQuantPerLayerGradGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *grad_input = GetDeviceAddress<float>(outputs, 0);
float *grad_alpha = GetDeviceAddress<float>(outputs, 1);
float *gradient = GetDeviceAddress<float>(inputs, 0);
float *input = GetDeviceAddress<float>(inputs, 1);
float *input_alpha = GetDeviceAddress<float>(inputs, 2);
float *input_quant_max = GetDeviceAddress<float>(inputs, 3);
float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
float *input_quant = GetDeviceAddress<float>(workspace, 1);
float *grad_input = GetDeviceAddress<float>(outputs, kIndex0);
float *grad_alpha = GetDeviceAddress<float>(outputs, kIndex1);
float *gradient = GetDeviceAddress<float>(inputs, kIndex0);
float *input = GetDeviceAddress<float>(inputs, kIndex1);
float *input_alpha = GetDeviceAddress<float>(inputs, kIndex2);
float *input_quant_max = GetDeviceAddress<float>(inputs, kIndex3);
float *input_div_alpha = GetDeviceAddress<float>(workspace, kIndex0);
float *input_quant = GetDeviceAddress<float>(workspace, kIndex1);
MS_EXCEPTION_IF_NULL(grad_input);
MS_EXCEPTION_IF_NULL(grad_alpha);

View File

@ -15,11 +15,12 @@
*/
#include "plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -40,12 +41,12 @@ bool FakeQuantPerChannelGpuKernelMod::Init(const CNodePtr &kernel_node) {
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
if (input_num != kSize3) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
if (output_num != kSize1) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
}
@ -57,9 +58,9 @@ bool FakeQuantPerChannelGpuKernelMod::Init(const CNodePtr &kernel_node) {
symmetric_ = GetValue<bool>(prim->GetAttr("symmetric"));
narrow_range_ = GetValue<bool>(prim->GetAttr("narrow_range"));
quant_delay_ = static_cast<int>(GetValue<int64_t>(prim->GetAttr("quant_delay")));
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
if (num_bits_ <= 2 || num_bits_ >= 16) {
if (num_bits_ <= kMinQuantBit || num_bits_ >= kMaxQuantBit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of num_bits should be in (2, 16), but got "
<< num_bits_;
}
@ -117,13 +118,13 @@ bool FakeQuantPerChannelGpuKernelMod::Launch(const std::vector<AddressPtr> &inpu
return true;
}
(void)workspace;
float *output = GetDeviceAddress<float>(outputs, 0);
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_min = GetDeviceAddress<float>(inputs, 1);
float *input_max = GetDeviceAddress<float>(inputs, 2);
float *scale = GetDeviceAddress<float>(workspace, 0);
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
float *output = GetDeviceAddress<float>(outputs, kIndex0);
float *input = GetDeviceAddress<float>(inputs, kIndex0);
float *input_min = GetDeviceAddress<float>(inputs, kIndex1);
float *input_max = GetDeviceAddress<float>(inputs, kIndex2);
float *scale = GetDeviceAddress<float>(workspace, kIndex0);
float *nudge_min = GetDeviceAddress<float>(workspace, kIndex1);
float *nudge_max = GetDeviceAddress<float>(workspace, kIndex2);
if (training_) {
if (global_step_ >= quant_delay_) {

View File

@ -16,6 +16,7 @@
#include "plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -35,19 +36,19 @@ bool FakeQuantPerChannelGradGpuKernelMod::Init(const CNodePtr &kernel_node) {
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
if (input_num != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
if (output_num != kSize1) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
}
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
num_bits_ = static_cast<unsigned int>(GetValue<int64_t>(prim->GetAttr("num_bits")));
if (num_bits_ <= 2 || num_bits_ >= 16) {
if (num_bits_ <= kMinQuantBit || num_bits_ >= kMaxQuantBit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of num_bits should be in (2, 16), but got "
<< num_bits_;
}
@ -68,7 +69,7 @@ bool FakeQuantPerChannelGradGpuKernelMod::Init(const CNodePtr &kernel_node) {
quant_min_++;
}
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
if (is_null_input_) {
InitSizeLists();
@ -101,14 +102,14 @@ bool FakeQuantPerChannelGradGpuKernelMod::Launch(const std::vector<AddressPtr> &
return true;
}
(void)workspace;
float *output = GetDeviceAddress<float>(outputs, 0);
float *gradient = GetDeviceAddress<float>(inputs, 0);
float *input = GetDeviceAddress<float>(inputs, 1);
float *input_min = GetDeviceAddress<float>(inputs, 2);
float *input_max = GetDeviceAddress<float>(inputs, 3);
float *scale = GetDeviceAddress<float>(workspace, 0);
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
float *output = GetDeviceAddress<float>(outputs, kIndex0);
float *gradient = GetDeviceAddress<float>(inputs, kIndex0);
float *input = GetDeviceAddress<float>(inputs, kIndex1);
float *input_min = GetDeviceAddress<float>(inputs, kIndex2);
float *input_max = GetDeviceAddress<float>(inputs, kIndex3);
float *scale = GetDeviceAddress<float>(workspace, kIndex0);
float *nudge_min = GetDeviceAddress<float>(workspace, kIndex1);
float *nudge_max = GetDeviceAddress<float>(workspace, kIndex2);
int total_size = input_size_ / sizeof(float);
if (global_step_ >= quant_delay_) {

View File

@ -15,11 +15,12 @@
*/
#include "plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -40,12 +41,12 @@ bool FakeQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
if (input_num != kSize3) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
if (output_num != kSize1) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
}
@ -57,7 +58,7 @@ bool FakeQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
symmetric_ = GetValue<bool>(prim->GetAttr("symmetric"));
narrow_range_ = GetValue<bool>(prim->GetAttr("narrow_range"));
if (num_bits_ <= 2 || num_bits_ >= 16) {
if (num_bits_ <= kMinQuantBit || num_bits_ >= kMaxQuantBit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of num_bits should be in (2, 16), but got "
<< num_bits_;
}
@ -74,7 +75,7 @@ bool FakeQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
}
// init size
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
if (is_null_input_) {
InitSizeLists();
@ -103,13 +104,13 @@ bool FakeQuantPerLayerGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs
if (is_null_input_) {
return true;
}
float *output = GetDeviceAddress<float>(outputs, 0);
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_min = GetDeviceAddress<float>(inputs, 1);
float *input_max = GetDeviceAddress<float>(inputs, 2);
float *scale = GetDeviceAddress<float>(workspace, 0);
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
float *output = GetDeviceAddress<float>(outputs, kIndex0);
float *input = GetDeviceAddress<float>(inputs, kIndex0);
float *input_min = GetDeviceAddress<float>(inputs, kIndex1);
float *input_max = GetDeviceAddress<float>(inputs, kIndex2);
float *scale = GetDeviceAddress<float>(workspace, kIndex0);
float *nudge_min = GetDeviceAddress<float>(workspace, kIndex1);
float *nudge_max = GetDeviceAddress<float>(workspace, kIndex2);
if (training_) {
// control flow for quant_delay

View File

@ -16,6 +16,7 @@
#include "plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -36,19 +37,19 @@ bool FakeQuantPerLayerGradGpuKernelMod::Init(const CNodePtr &kernel_node) {
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
if (input_num != kSize4) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
if (output_num != kSize1) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
}
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
num_bits_ = static_cast<unsigned int>(GetValue<int64_t>(prim->GetAttr("num_bits")));
if (num_bits_ <= 2 || num_bits_ >= 16) {
if (num_bits_ <= kMinQuantBit || num_bits_ >= kMaxQuantBit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of num_bits should be in (2, 16), but got "
<< num_bits_;
}
@ -70,7 +71,7 @@ bool FakeQuantPerLayerGradGpuKernelMod::Init(const CNodePtr &kernel_node) {
}
// init size
auto input_shape = Convert2SizeT(common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0));
auto input_shape = Convert2SizeT(common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0));
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
if (is_null_input_) {
InitSizeLists();
@ -104,14 +105,14 @@ bool FakeQuantPerLayerGradGpuKernelMod::Launch(const std::vector<AddressPtr> &in
if (is_null_input_) {
return true;
}
float *output = GetDeviceAddress<float>(outputs, 0);
float *gradient = GetDeviceAddress<float>(inputs, 0);
float *input = GetDeviceAddress<float>(inputs, 1);
float *input_min = GetDeviceAddress<float>(inputs, 2);
float *input_max = GetDeviceAddress<float>(inputs, 3);
float *scale = GetDeviceAddress<float>(workspace, 0);
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
float *output = GetDeviceAddress<float>(outputs, kIndex0);
float *gradient = GetDeviceAddress<float>(inputs, kIndex0);
float *input = GetDeviceAddress<float>(inputs, kIndex1);
float *input_min = GetDeviceAddress<float>(inputs, kIndex2);
float *input_max = GetDeviceAddress<float>(inputs, kIndex3);
float *scale = GetDeviceAddress<float>(workspace, kIndex0);
float *nudge_min = GetDeviceAddress<float>(workspace, kIndex1);
float *nudge_max = GetDeviceAddress<float>(workspace, kIndex2);
if (global_step_ >= quant_delay_) {
CalNudgePerLayer(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, symmetric_,

View File

@ -15,11 +15,12 @@
*/
#include "plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -30,12 +31,12 @@ bool MinMaxUpdatePerChannelGpuKernelMod::Init(const CNodePtr &kernel_node) {
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
kernel_node_ = kernel_node;
if (input_num != 3) {
if (input_num != kSize3) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
if (output_num != kSize2) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 2, but got " << output_num;
}
@ -45,7 +46,7 @@ bool MinMaxUpdatePerChannelGpuKernelMod::Init(const CNodePtr &kernel_node) {
ema_decay_ = GetValue<float>(prim->GetAttr("ema_decay"));
// init size
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
if (is_null_input_) {
InitSizeLists();
@ -75,11 +76,11 @@ bool MinMaxUpdatePerChannelGpuKernelMod::Launch(const std::vector<AddressPtr> &i
if (is_null_input_) {
return true;
}
float *output_min = GetDeviceAddress<float>(outputs, 0);
float *output_max = GetDeviceAddress<float>(outputs, 1);
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_min = GetDeviceAddress<float>(inputs, 1);
float *input_max = GetDeviceAddress<float>(inputs, 2);
float *output_min = GetDeviceAddress<float>(outputs, kIndex0);
float *output_max = GetDeviceAddress<float>(outputs, kIndex1);
float *input = GetDeviceAddress<float>(inputs, kIndex0);
float *input_min = GetDeviceAddress<float>(inputs, kIndex1);
float *input_max = GetDeviceAddress<float>(inputs, kIndex2);
// calculate the input min and max according by the parameter ema and ema_decay.
CalMinMaxPerChannel(input, input_min, input_max, output_min, output_max, input_size_ / sizeof(float), num_channels_,

View File

@ -15,11 +15,12 @@
*/
#include "plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
namespace mindspore {
namespace kernel {
@ -30,12 +31,12 @@ bool MinMaxUpdatePerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
kernel_node_ = kernel_node;
if (input_num != 3) {
if (input_num != kSize3) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
if (output_num != kSize2) {
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 2, but got " << output_num;
}
@ -45,7 +46,7 @@ bool MinMaxUpdatePerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
ema_decay_ = GetValue<float>(prim->GetAttr("ema_decay"));
// init size
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
if (is_null_input_) {
InitSizeLists();
@ -71,11 +72,11 @@ bool MinMaxUpdatePerLayerGpuKernelMod::Launch(const std::vector<AddressPtr> &inp
if (is_null_input_) {
return true;
}
float *output_min = GetDeviceAddress<float>(outputs, 0);
float *output_max = GetDeviceAddress<float>(outputs, 1);
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_min = GetDeviceAddress<float>(inputs, 1);
float *input_max = GetDeviceAddress<float>(inputs, 2);
float *output_min = GetDeviceAddress<float>(outputs, kIndex0);
float *output_max = GetDeviceAddress<float>(outputs, kIndex1);
float *input = GetDeviceAddress<float>(inputs, kIndex0);
float *input_min = GetDeviceAddress<float>(inputs, kIndex1);
float *input_max = GetDeviceAddress<float>(inputs, kIndex2);
CalMinMaxPerLayer(input, input_min, input_max, output_min, output_max, quant_num_, ema_decay_, ema_,
reinterpret_cast<cudaStream_t>(stream_ptr));

View File

@ -0,0 +1,45 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_QUANT_QUANT_OP_CONST_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_QUANT_QUANT_OP_CONST_H_
namespace mindspore {
namespace kernel {
constexpr int kIndex0 = 0;
constexpr int kIndex1 = 1;
constexpr int kIndex2 = 2;
constexpr int kIndex3 = 3;
constexpr int kIndex4 = 4;
constexpr int kIndex5 = 5;
constexpr int kIndex6 = 6;
constexpr int kIndex7 = 7;
constexpr int kSize0 = 0;
constexpr int kSize1 = 1;
constexpr int kSize2 = 2;
constexpr int kSize3 = 3;
constexpr int kSize4 = 4;
constexpr int kSize5 = 5;
constexpr int kSize6 = 6;
constexpr int kSize7 = 7;
constexpr int kMinQuantBit = 2;
constexpr int kMaxQuantBit = 16;
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_QUANT_QUANT_OP_CONST_H_