forked from mindspore-Ecosystem/mindspore
!48588 fix magicnum in quant cuda kernel
Merge pull request !48588 from hangq/master
This commit is contained in:
commit
364e292587
|
@ -21,6 +21,7 @@
|
|||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -45,15 +46,15 @@ class BatchNormFold2GpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
return true;
|
||||
}
|
||||
|
||||
auto *input = GetDeviceAddress<T>(inputs, 0);
|
||||
auto *beta = GetDeviceAddress<T>(inputs, 1);
|
||||
auto *gamma = GetDeviceAddress<T>(inputs, 2);
|
||||
auto *batch_std = GetDeviceAddress<T>(inputs, 3);
|
||||
auto *batch_mean = GetDeviceAddress<T>(inputs, 4);
|
||||
auto *running_std = GetDeviceAddress<T>(inputs, 5);
|
||||
auto *running_mean = GetDeviceAddress<T>(inputs, 6);
|
||||
auto *global_step = GetDeviceAddress<int32_t>(inputs, 7);
|
||||
auto *output = GetDeviceAddress<T>(outputs, 0);
|
||||
auto *input = GetDeviceAddress<T>(inputs, kIndex0);
|
||||
auto *beta = GetDeviceAddress<T>(inputs, kIndex1);
|
||||
auto *gamma = GetDeviceAddress<T>(inputs, kIndex2);
|
||||
auto *batch_std = GetDeviceAddress<T>(inputs, kIndex3);
|
||||
auto *batch_mean = GetDeviceAddress<T>(inputs, kIndex4);
|
||||
auto *running_std = GetDeviceAddress<T>(inputs, kIndex5);
|
||||
auto *running_mean = GetDeviceAddress<T>(inputs, kIndex6);
|
||||
auto *global_step = GetDeviceAddress<int32_t>(inputs, kIndex7);
|
||||
auto *output = GetDeviceAddress<T>(outputs, kIndex0);
|
||||
|
||||
BatchNormFold2Forward(input, beta, gamma, batch_std, batch_mean, running_std, running_mean, global_step, output,
|
||||
freeze_bn_, batch_size_, channel_, height_, width_,
|
||||
|
@ -83,14 +84,14 @@ class BatchNormFold2GpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
return true;
|
||||
}
|
||||
|
||||
if (input_shape.size() != 4) {
|
||||
if (input_shape.size() != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_size_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
height_ = input_shape[2];
|
||||
width_ = input_shape[3];
|
||||
batch_size_ = input_shape[kIndex0];
|
||||
channel_ = input_shape[kIndex1];
|
||||
height_ = input_shape[kIndex2];
|
||||
width_ = input_shape[kIndex3];
|
||||
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(prim);
|
||||
freeze_bn_ = static_cast<int>(GetValue<int64_t>(prim->GetAttr("freeze_bn")));
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold2_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -45,23 +46,23 @@ class BatchNormFold2GradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
return true;
|
||||
}
|
||||
|
||||
auto *dout = GetDeviceAddress<T>(inputs, 0);
|
||||
auto *x = GetDeviceAddress<T>(inputs, 1);
|
||||
auto *gamma = GetDeviceAddress<T>(inputs, 2);
|
||||
auto *batch_std = GetDeviceAddress<T>(inputs, 3);
|
||||
auto *batch_mean = GetDeviceAddress<T>(inputs, 4);
|
||||
auto *running_std = GetDeviceAddress<T>(inputs, 5);
|
||||
auto *running_mean = GetDeviceAddress<T>(inputs, 6);
|
||||
auto *global_step = GetDeviceAddress<int32_t>(inputs, 7);
|
||||
auto *d_batch_std = GetDeviceAddress<T>(outputs, 0);
|
||||
auto *d_batch_mean = GetDeviceAddress<T>(outputs, 1);
|
||||
auto *d_beta = GetDeviceAddress<T>(outputs, 2);
|
||||
auto *d_gamma = GetDeviceAddress<T>(outputs, 3);
|
||||
auto *d_x = GetDeviceAddress<T>(outputs, 4);
|
||||
auto *tmp = GetDeviceAddress<T>(workspace, 0);
|
||||
auto *tmp2 = GetDeviceAddress<T>(workspace, 1);
|
||||
auto *reduce_x = GetDeviceAddress<T>(workspace, 2);
|
||||
auto *tmp_x = GetDeviceAddress<T>(workspace, 3);
|
||||
auto *dout = GetDeviceAddress<T>(inputs, kIndex0);
|
||||
auto *x = GetDeviceAddress<T>(inputs, kIndex1);
|
||||
auto *gamma = GetDeviceAddress<T>(inputs, kIndex2);
|
||||
auto *batch_std = GetDeviceAddress<T>(inputs, kIndex3);
|
||||
auto *batch_mean = GetDeviceAddress<T>(inputs, kIndex4);
|
||||
auto *running_std = GetDeviceAddress<T>(inputs, kIndex5);
|
||||
auto *running_mean = GetDeviceAddress<T>(inputs, kIndex6);
|
||||
auto *global_step = GetDeviceAddress<int32_t>(inputs, kIndex7);
|
||||
auto *d_batch_std = GetDeviceAddress<T>(outputs, kIndex0);
|
||||
auto *d_batch_mean = GetDeviceAddress<T>(outputs, kIndex1);
|
||||
auto *d_beta = GetDeviceAddress<T>(outputs, kIndex2);
|
||||
auto *d_gamma = GetDeviceAddress<T>(outputs, kIndex3);
|
||||
auto *d_x = GetDeviceAddress<T>(outputs, kIndex4);
|
||||
auto *tmp = GetDeviceAddress<T>(workspace, kIndex0);
|
||||
auto *tmp2 = GetDeviceAddress<T>(workspace, kIndex1);
|
||||
auto *reduce_x = GetDeviceAddress<T>(workspace, kIndex2);
|
||||
auto *tmp_x = GetDeviceAddress<T>(workspace, kIndex3);
|
||||
|
||||
int32_t current_step_host[1];
|
||||
size_t x_size = batch_size_ * channel_ * height_ * width_ * sizeof(T);
|
||||
|
@ -110,14 +111,14 @@ class BatchNormFold2GradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
return true;
|
||||
}
|
||||
|
||||
if (input_shape.size() != 4) {
|
||||
if (input_shape.size() != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_size_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
height_ = input_shape[2];
|
||||
width_ = input_shape[3];
|
||||
batch_size_ = input_shape[kIndex0];
|
||||
channel_ = input_shape[kIndex1];
|
||||
height_ = input_shape[kIndex2];
|
||||
width_ = input_shape[kIndex3];
|
||||
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(prim);
|
||||
freeze_bn_ = GetValue<int64_t>(prim->GetAttr("freeze_bn"));
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "plugin/device/gpu/kernel/kernel_constants.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -53,21 +54,21 @@ class BatchNormFoldGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
return true;
|
||||
}
|
||||
(void)workspace;
|
||||
auto x = GetDeviceAddress<T>(inputs, 0);
|
||||
auto mean = GetDeviceAddress<T>(inputs, 1);
|
||||
auto variance = GetDeviceAddress<T>(inputs, 2);
|
||||
int *current_step = GetDeviceAddress<int>(inputs, 3);
|
||||
auto x = GetDeviceAddress<T>(inputs, kIndex0);
|
||||
auto mean = GetDeviceAddress<T>(inputs, kIndex1);
|
||||
auto variance = GetDeviceAddress<T>(inputs, kIndex2);
|
||||
int *current_step = GetDeviceAddress<int>(inputs, kIndex3);
|
||||
int current_step_host[1];
|
||||
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
|
||||
cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
"Copy gpu memoy failed.");
|
||||
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed");
|
||||
auto batch_mean = GetDeviceAddress<T>(outputs, 0);
|
||||
auto batch_std = GetDeviceAddress<T>(outputs, 1);
|
||||
auto running_mean = GetDeviceAddress<T>(outputs, 2);
|
||||
auto running_std = GetDeviceAddress<T>(outputs, 3);
|
||||
auto y = GetDeviceAddress<T>(workspace, 0);
|
||||
auto batch_mean = GetDeviceAddress<T>(outputs, kIndex0);
|
||||
auto batch_std = GetDeviceAddress<T>(outputs, kIndex1);
|
||||
auto running_mean = GetDeviceAddress<T>(outputs, kIndex2);
|
||||
auto running_std = GetDeviceAddress<T>(outputs, kIndex3);
|
||||
auto y = GetDeviceAddress<T>(workspace, kIndex0);
|
||||
|
||||
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
|
||||
cudaMemcpyAsync(running_mean, mean, output_size_, cudaMemcpyDeviceToDevice,
|
||||
|
@ -99,12 +100,12 @@ class BatchNormFoldGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
kernel_node_ = kernel_node;
|
||||
InitResource();
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
if (input_num != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 4) {
|
||||
if (output_num != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 4, but got " << output_num;
|
||||
}
|
||||
|
||||
|
@ -122,15 +123,15 @@ class BatchNormFoldGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (input_shape.size() != 4) {
|
||||
if (input_shape.size() != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
CheckTensorSize({input_shape});
|
||||
batch_ = LongToInt(input_shape[0]);
|
||||
channel_ = LongToInt(input_shape[1]);
|
||||
height_ = LongToInt(input_shape[2]);
|
||||
width_ = LongToInt(input_shape[3]);
|
||||
batch_ = LongToInt(input_shape[kIndex0]);
|
||||
channel_ = LongToInt(input_shape[kIndex1]);
|
||||
height_ = LongToInt(input_shape[kIndex2]);
|
||||
width_ = LongToInt(input_shape[kIndex3]);
|
||||
|
||||
input_size_ = sizeof(T) * batch_ * channel_ * height_ * width_;
|
||||
output_size_ = sizeof(T) * channel_;
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchnorm_fold_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -50,19 +51,19 @@ class BatchNormFoldGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
return true;
|
||||
}
|
||||
// 'd_batch_mean', 'd_batch_std', 'x', 'batch_mean', 'batch_std', 'current_step'
|
||||
T *d_batch_mean = GetDeviceAddress<T>(inputs, 0);
|
||||
T *d_batch_std = GetDeviceAddress<T>(inputs, 1);
|
||||
T *x = GetDeviceAddress<T>(inputs, 2);
|
||||
T *batch_mean = GetDeviceAddress<T>(inputs, 3);
|
||||
T *batch_std = GetDeviceAddress<T>(inputs, 4);
|
||||
int *current_step = GetDeviceAddress<int>(inputs, 5);
|
||||
T *d_batch_mean = GetDeviceAddress<T>(inputs, kIndex0);
|
||||
T *d_batch_std = GetDeviceAddress<T>(inputs, kIndex1);
|
||||
T *x = GetDeviceAddress<T>(inputs, kIndex2);
|
||||
T *batch_mean = GetDeviceAddress<T>(inputs, kIndex3);
|
||||
T *batch_std = GetDeviceAddress<T>(inputs, kIndex4);
|
||||
int *current_step = GetDeviceAddress<int>(inputs, kIndex5);
|
||||
int current_step_host[1];
|
||||
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
|
||||
cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
"Copy gpu memoy failed.");
|
||||
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed");
|
||||
T *dx = GetDeviceAddress<T>(outputs, 0);
|
||||
T *dx = GetDeviceAddress<T>(outputs, kIndex0);
|
||||
|
||||
if (!is_training_ || current_step_host[0] >= freeze_bn_) {
|
||||
ThrustFillWith(dx, batch_ * channel_ * height_ * width_, 0.f, reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
|
@ -99,14 +100,14 @@ class BatchNormFoldGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (input_shape.size() != 4) {
|
||||
if (input_shape.size() != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
height_ = input_shape[2];
|
||||
width_ = input_shape[3];
|
||||
batch_ = input_shape[kIndex0];
|
||||
channel_ = input_shape[kIndex1];
|
||||
height_ = input_shape[kIndex2];
|
||||
width_ = input_shape[kIndex3];
|
||||
|
||||
input_size_ = sizeof(T) * batch_ * channel_ * height_ * width_;
|
||||
channel_size_ = sizeof(T) * channel_;
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -35,10 +36,10 @@ class CorrectionMulGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
auto *weight = GetDeviceAddress<T>(inputs, 0);
|
||||
auto *gamma = GetDeviceAddress<T>(inputs, 1);
|
||||
auto *running_std = GetDeviceAddress<T>(inputs, 2);
|
||||
auto *output = GetDeviceAddress<T>(outputs, 0);
|
||||
auto *weight = GetDeviceAddress<T>(inputs, kIndex0);
|
||||
auto *gamma = GetDeviceAddress<T>(inputs, kIndex1);
|
||||
auto *running_std = GetDeviceAddress<T>(inputs, kIndex2);
|
||||
auto *output = GetDeviceAddress<T>(outputs, kIndex0);
|
||||
|
||||
CalCorrectionMul(weight, gamma, running_std, batch_size_, channel_, height_, width_, output,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
|
@ -50,7 +51,7 @@ class CorrectionMulGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
InitResource();
|
||||
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 3) {
|
||||
if (input_num != kSize3) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
|
||||
|
@ -64,14 +65,14 @@ class CorrectionMulGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (input_shape.size() != 4) {
|
||||
if (input_shape.size() != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_size_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
height_ = input_shape[2];
|
||||
width_ = input_shape[3];
|
||||
batch_size_ = input_shape[kIndex0];
|
||||
channel_ = input_shape[kIndex1];
|
||||
height_ = input_shape[kIndex2];
|
||||
width_ = input_shape[kIndex3];
|
||||
|
||||
InitSizeLists();
|
||||
return true;
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/correction_mul_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -35,13 +36,13 @@ class CorrectionMulGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
auto *d_out = GetDeviceAddress<T>(inputs, 0);
|
||||
auto *weight = GetDeviceAddress<T>(inputs, 1);
|
||||
auto *gamma = GetDeviceAddress<T>(inputs, 2);
|
||||
auto *running_std = GetDeviceAddress<T>(inputs, 3);
|
||||
auto *d_weight = GetDeviceAddress<T>(outputs, 0);
|
||||
auto *d_gamma = GetDeviceAddress<T>(outputs, 1);
|
||||
auto *tmp = GetDeviceAddress<T>(workspace, 0);
|
||||
auto *d_out = GetDeviceAddress<T>(inputs, kIndex0);
|
||||
auto *weight = GetDeviceAddress<T>(inputs, kIndex1);
|
||||
auto *gamma = GetDeviceAddress<T>(inputs, kIndex2);
|
||||
auto *running_std = GetDeviceAddress<T>(inputs, kIndex3);
|
||||
auto *d_weight = GetDeviceAddress<T>(outputs, kIndex0);
|
||||
auto *d_gamma = GetDeviceAddress<T>(outputs, kIndex1);
|
||||
auto *tmp = GetDeviceAddress<T>(workspace, kIndex0);
|
||||
|
||||
CalCorrectionMul(d_out, gamma, running_std, batch_size_, channel_, height_, width_, d_weight,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
|
@ -56,11 +57,11 @@ class CorrectionMulGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
InitResource();
|
||||
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
if (input_num != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
|
||||
}
|
||||
|
||||
auto shape_signed = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto shape_signed = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
if (IsDynamic(shape_signed)) {
|
||||
return true;
|
||||
}
|
||||
|
@ -70,14 +71,14 @@ class CorrectionMulGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
|
|||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (input_shape.size() != 4) {
|
||||
if (input_shape.size() != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_size_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
height_ = input_shape[2];
|
||||
width_ = input_shape[3];
|
||||
batch_size_ = input_shape[kIndex0];
|
||||
channel_ = input_shape[kIndex1];
|
||||
height_ = input_shape[kIndex2];
|
||||
width_ = input_shape[kIndex3];
|
||||
|
||||
InitSizeLists();
|
||||
return true;
|
||||
|
|
|
@ -15,11 +15,12 @@
|
|||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh"
|
||||
#include <thrust/extrema.h>
|
||||
#include <thrust/pair.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -36,12 +37,12 @@ bool FakeLearnedScaleQuantPerChannelGpuKernelMod::Init(const CNodePtr &kernel_no
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 3) {
|
||||
if (input_num != kSize3) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
if (output_num != kSize1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
|
@ -51,7 +52,7 @@ bool FakeLearnedScaleQuantPerChannelGpuKernelMod::Init(const CNodePtr &kernel_no
|
|||
neg_trunc_ = GetValue<bool>(common::AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
|
||||
|
||||
// init size
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
num_channels_ = LongToInt(input_shape[0]);
|
||||
for (size_t i = 0; i < input_shape.size(); ++i) {
|
||||
quant_num_ *= LongToInt(input_shape[i]);
|
||||
|
@ -73,12 +74,12 @@ void FakeLearnedScaleQuantPerChannelGpuKernelMod::InitSizeLists() {
|
|||
bool FakeLearnedScaleQuantPerChannelGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
float *input = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input_alpha = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_quant_max = GetDeviceAddress<float>(inputs, 2);
|
||||
float *output = GetDeviceAddress<float>(outputs, 0);
|
||||
float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
|
||||
float *input_quant = GetDeviceAddress<float>(workspace, 1);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input_alpha = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_quant_max = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
float *output = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *input_div_alpha = GetDeviceAddress<float>(workspace, kIndex0);
|
||||
float *input_quant = GetDeviceAddress<float>(workspace, kIndex1);
|
||||
|
||||
MS_EXCEPTION_IF_NULL(input);
|
||||
MS_EXCEPTION_IF_NULL(input_alpha);
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
#include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perchannel_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -31,12 +32,12 @@ bool FakeLearnedScaleQuantPerChannelGradGpuKernelMod::Init(const CNodePtr &kerne
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
if (input_num != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 2) {
|
||||
if (output_num != kSize2) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 2, but got " << output_num;
|
||||
}
|
||||
|
||||
|
@ -50,7 +51,7 @@ bool FakeLearnedScaleQuantPerChannelGradGpuKernelMod::Init(const CNodePtr &kerne
|
|||
neg_trunc_ = GetValue<bool>(common::AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
|
||||
|
||||
// init size
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
num_channels_ = LongToInt(input_shape[0]);
|
||||
for (size_t i = 0; i < input_shape.size(); ++i) {
|
||||
quant_num_ *= LongToInt(input_shape[i]);
|
||||
|
@ -74,14 +75,14 @@ void FakeLearnedScaleQuantPerChannelGradGpuKernelMod::InitSizeLists() {
|
|||
bool FakeLearnedScaleQuantPerChannelGradGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
float *grad_input = GetDeviceAddress<float>(outputs, 0);
|
||||
float *grad_alpha = GetDeviceAddress<float>(outputs, 1);
|
||||
float *gradient = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_alpha = GetDeviceAddress<float>(inputs, 2);
|
||||
float *input_quant_max = GetDeviceAddress<float>(inputs, 3);
|
||||
float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
|
||||
float *input_quant = GetDeviceAddress<float>(workspace, 1);
|
||||
float *grad_input = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *grad_alpha = GetDeviceAddress<float>(outputs, kIndex1);
|
||||
float *gradient = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_alpha = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
float *input_quant_max = GetDeviceAddress<float>(inputs, kIndex3);
|
||||
float *input_div_alpha = GetDeviceAddress<float>(workspace, kIndex0);
|
||||
float *input_quant = GetDeviceAddress<float>(workspace, kIndex1);
|
||||
|
||||
MS_EXCEPTION_IF_NULL(grad_input);
|
||||
MS_EXCEPTION_IF_NULL(grad_alpha);
|
||||
|
|
|
@ -15,11 +15,12 @@
|
|||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh"
|
||||
#include <thrust/extrema.h>
|
||||
#include <thrust/pair.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -30,12 +31,12 @@ bool FakeLearnedScaleQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 3) {
|
||||
if (input_num != kSize3) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
if (output_num != kSize1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
|
@ -45,7 +46,7 @@ bool FakeLearnedScaleQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node
|
|||
neg_trunc_ = GetValue<bool>(common::AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
|
||||
|
||||
// init size
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
for (size_t i = 0; i < input_shape.size(); ++i) {
|
||||
quant_num_ *= LongToInt(input_shape[i]);
|
||||
}
|
||||
|
@ -66,12 +67,12 @@ void FakeLearnedScaleQuantPerLayerGpuKernelMod::InitSizeLists() {
|
|||
bool FakeLearnedScaleQuantPerLayerGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
float *input = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input_alpha = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_quant_max = GetDeviceAddress<float>(inputs, 2);
|
||||
float *output = GetDeviceAddress<float>(outputs, 0);
|
||||
float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
|
||||
float *input_quant = GetDeviceAddress<float>(workspace, 1);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input_alpha = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_quant_max = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
float *output = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *input_div_alpha = GetDeviceAddress<float>(workspace, kIndex0);
|
||||
float *input_quant = GetDeviceAddress<float>(workspace, kIndex1);
|
||||
|
||||
MS_EXCEPTION_IF_NULL(input);
|
||||
MS_EXCEPTION_IF_NULL(input_alpha);
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
#include "plugin/device/gpu/kernel/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_learned_scale_quant_perlayer_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -25,12 +26,12 @@ bool FakeLearnedScaleQuantPerLayerGradGpuKernelMod::Init(const CNodePtr &kernel_
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
if (input_num != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 2) {
|
||||
if (output_num != kSize2) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 2, but got " << output_num;
|
||||
}
|
||||
|
||||
|
@ -44,7 +45,7 @@ bool FakeLearnedScaleQuantPerLayerGradGpuKernelMod::Init(const CNodePtr &kernel_
|
|||
neg_trunc_ = GetValue<bool>(common::AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
|
||||
|
||||
// init size
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
auto size = SizeOf(input_shape);
|
||||
quant_num_ = SizeToInt(size);
|
||||
input_size_ = sizeof(float) * size;
|
||||
|
@ -66,14 +67,14 @@ void FakeLearnedScaleQuantPerLayerGradGpuKernelMod::InitSizeLists() {
|
|||
bool FakeLearnedScaleQuantPerLayerGradGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
float *grad_input = GetDeviceAddress<float>(outputs, 0);
|
||||
float *grad_alpha = GetDeviceAddress<float>(outputs, 1);
|
||||
float *gradient = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_alpha = GetDeviceAddress<float>(inputs, 2);
|
||||
float *input_quant_max = GetDeviceAddress<float>(inputs, 3);
|
||||
float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
|
||||
float *input_quant = GetDeviceAddress<float>(workspace, 1);
|
||||
float *grad_input = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *grad_alpha = GetDeviceAddress<float>(outputs, kIndex1);
|
||||
float *gradient = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_alpha = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
float *input_quant_max = GetDeviceAddress<float>(inputs, kIndex3);
|
||||
float *input_div_alpha = GetDeviceAddress<float>(workspace, kIndex0);
|
||||
float *input_quant = GetDeviceAddress<float>(workspace, kIndex1);
|
||||
|
||||
MS_EXCEPTION_IF_NULL(grad_input);
|
||||
MS_EXCEPTION_IF_NULL(grad_alpha);
|
||||
|
|
|
@ -15,11 +15,12 @@
|
|||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/quant/fake_quant_perchannel_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh"
|
||||
#include <thrust/extrema.h>
|
||||
#include <thrust/pair.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -40,12 +41,12 @@ bool FakeQuantPerChannelGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 3) {
|
||||
if (input_num != kSize3) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
if (output_num != kSize1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
|
@ -57,9 +58,9 @@ bool FakeQuantPerChannelGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
symmetric_ = GetValue<bool>(prim->GetAttr("symmetric"));
|
||||
narrow_range_ = GetValue<bool>(prim->GetAttr("narrow_range"));
|
||||
quant_delay_ = static_cast<int>(GetValue<int64_t>(prim->GetAttr("quant_delay")));
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
|
||||
if (num_bits_ <= 2 || num_bits_ >= 16) {
|
||||
if (num_bits_ <= kMinQuantBit || num_bits_ >= kMaxQuantBit) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of num_bits should be in (2, 16), but got "
|
||||
<< num_bits_;
|
||||
}
|
||||
|
@ -117,13 +118,13 @@ bool FakeQuantPerChannelGpuKernelMod::Launch(const std::vector<AddressPtr> &inpu
|
|||
return true;
|
||||
}
|
||||
(void)workspace;
|
||||
float *output = GetDeviceAddress<float>(outputs, 0);
|
||||
float *input = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, 2);
|
||||
float *scale = GetDeviceAddress<float>(workspace, 0);
|
||||
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
|
||||
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
|
||||
float *output = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
float *scale = GetDeviceAddress<float>(workspace, kIndex0);
|
||||
float *nudge_min = GetDeviceAddress<float>(workspace, kIndex1);
|
||||
float *nudge_max = GetDeviceAddress<float>(workspace, kIndex2);
|
||||
|
||||
if (training_) {
|
||||
if (global_step_ >= quant_delay_) {
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "plugin/device/gpu/kernel/quant/fake_quant_perchannel_grad_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perchannel_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -35,19 +36,19 @@ bool FakeQuantPerChannelGradGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
if (input_num != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
if (output_num != kSize1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(prim);
|
||||
num_bits_ = static_cast<unsigned int>(GetValue<int64_t>(prim->GetAttr("num_bits")));
|
||||
if (num_bits_ <= 2 || num_bits_ >= 16) {
|
||||
if (num_bits_ <= kMinQuantBit || num_bits_ >= kMaxQuantBit) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of num_bits should be in (2, 16), but got "
|
||||
<< num_bits_;
|
||||
}
|
||||
|
@ -68,7 +69,7 @@ bool FakeQuantPerChannelGradGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
quant_min_++;
|
||||
}
|
||||
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
|
@ -101,14 +102,14 @@ bool FakeQuantPerChannelGradGpuKernelMod::Launch(const std::vector<AddressPtr> &
|
|||
return true;
|
||||
}
|
||||
(void)workspace;
|
||||
float *output = GetDeviceAddress<float>(outputs, 0);
|
||||
float *gradient = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, 2);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, 3);
|
||||
float *scale = GetDeviceAddress<float>(workspace, 0);
|
||||
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
|
||||
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
|
||||
float *output = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *gradient = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, kIndex3);
|
||||
float *scale = GetDeviceAddress<float>(workspace, kIndex0);
|
||||
float *nudge_min = GetDeviceAddress<float>(workspace, kIndex1);
|
||||
float *nudge_max = GetDeviceAddress<float>(workspace, kIndex2);
|
||||
|
||||
int total_size = input_size_ / sizeof(float);
|
||||
if (global_step_ >= quant_delay_) {
|
||||
|
|
|
@ -15,11 +15,12 @@
|
|||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/quant/fake_quant_perlayer_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh"
|
||||
#include <thrust/extrema.h>
|
||||
#include <thrust/pair.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -40,12 +41,12 @@ bool FakeQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 3) {
|
||||
if (input_num != kSize3) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
if (output_num != kSize1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
|
@ -57,7 +58,7 @@ bool FakeQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
symmetric_ = GetValue<bool>(prim->GetAttr("symmetric"));
|
||||
narrow_range_ = GetValue<bool>(prim->GetAttr("narrow_range"));
|
||||
|
||||
if (num_bits_ <= 2 || num_bits_ >= 16) {
|
||||
if (num_bits_ <= kMinQuantBit || num_bits_ >= kMaxQuantBit) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of num_bits should be in (2, 16), but got "
|
||||
<< num_bits_;
|
||||
}
|
||||
|
@ -74,7 +75,7 @@ bool FakeQuantPerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
}
|
||||
|
||||
// init size
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
|
@ -103,13 +104,13 @@ bool FakeQuantPerLayerGpuKernelMod::Launch(const std::vector<AddressPtr> &inputs
|
|||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
float *output = GetDeviceAddress<float>(outputs, 0);
|
||||
float *input = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, 2);
|
||||
float *scale = GetDeviceAddress<float>(workspace, 0);
|
||||
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
|
||||
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
|
||||
float *output = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
float *scale = GetDeviceAddress<float>(workspace, kIndex0);
|
||||
float *nudge_min = GetDeviceAddress<float>(workspace, kIndex1);
|
||||
float *nudge_max = GetDeviceAddress<float>(workspace, kIndex2);
|
||||
|
||||
if (training_) {
|
||||
// control flow for quant_delay
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "plugin/device/gpu/kernel/quant/fake_quant_perlayer_grad_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/fake_quant_perlayer_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -36,19 +37,19 @@ bool FakeQuantPerLayerGradGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
if (input_num != kSize4) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
if (output_num != kSize1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(prim);
|
||||
num_bits_ = static_cast<unsigned int>(GetValue<int64_t>(prim->GetAttr("num_bits")));
|
||||
if (num_bits_ <= 2 || num_bits_ >= 16) {
|
||||
if (num_bits_ <= kMinQuantBit || num_bits_ >= kMaxQuantBit) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of num_bits should be in (2, 16), but got "
|
||||
<< num_bits_;
|
||||
}
|
||||
|
@ -70,7 +71,7 @@ bool FakeQuantPerLayerGradGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
}
|
||||
|
||||
// init size
|
||||
auto input_shape = Convert2SizeT(common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0));
|
||||
auto input_shape = Convert2SizeT(common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0));
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
|
@ -104,14 +105,14 @@ bool FakeQuantPerLayerGradGpuKernelMod::Launch(const std::vector<AddressPtr> &in
|
|||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
float *output = GetDeviceAddress<float>(outputs, 0);
|
||||
float *gradient = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, 2);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, 3);
|
||||
float *scale = GetDeviceAddress<float>(workspace, 0);
|
||||
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
|
||||
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
|
||||
float *output = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *gradient = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, kIndex3);
|
||||
float *scale = GetDeviceAddress<float>(workspace, kIndex0);
|
||||
float *nudge_min = GetDeviceAddress<float>(workspace, kIndex1);
|
||||
float *nudge_max = GetDeviceAddress<float>(workspace, kIndex2);
|
||||
|
||||
if (global_step_ >= quant_delay_) {
|
||||
CalNudgePerLayer(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, symmetric_,
|
||||
|
|
|
@ -15,11 +15,12 @@
|
|||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/quant/minmax_update_perchannel_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
|
||||
#include <thrust/extrema.h>
|
||||
#include <thrust/pair.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -30,12 +31,12 @@ bool MinMaxUpdatePerChannelGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
if (input_num != 3) {
|
||||
if (input_num != kSize3) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 2) {
|
||||
if (output_num != kSize2) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 2, but got " << output_num;
|
||||
}
|
||||
|
||||
|
@ -45,7 +46,7 @@ bool MinMaxUpdatePerChannelGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
ema_decay_ = GetValue<float>(prim->GetAttr("ema_decay"));
|
||||
|
||||
// init size
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
|
@ -75,11 +76,11 @@ bool MinMaxUpdatePerChannelGpuKernelMod::Launch(const std::vector<AddressPtr> &i
|
|||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
float *output_min = GetDeviceAddress<float>(outputs, 0);
|
||||
float *output_max = GetDeviceAddress<float>(outputs, 1);
|
||||
float *input = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, 2);
|
||||
float *output_min = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *output_max = GetDeviceAddress<float>(outputs, kIndex1);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
|
||||
// calculate the input min and max according by the parameter ema and ema_decay.
|
||||
CalMinMaxPerChannel(input, input_min, input_max, output_min, output_max, input_size_ / sizeof(float), num_channels_,
|
||||
|
|
|
@ -15,11 +15,12 @@
|
|||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/quant/minmax_update_perlayer_gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
|
||||
#include <thrust/extrema.h>
|
||||
#include <thrust/pair.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/minmax_update_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/quant/quant_op_const.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -30,12 +31,12 @@ bool MinMaxUpdatePerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
if (input_num != 3) {
|
||||
if (input_num != kSize3) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 2) {
|
||||
if (output_num != kSize2) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 2, but got " << output_num;
|
||||
}
|
||||
|
||||
|
@ -45,7 +46,7 @@ bool MinMaxUpdatePerLayerGpuKernelMod::Init(const CNodePtr &kernel_node) {
|
|||
ema_decay_ = GetValue<float>(prim->GetAttr("ema_decay"));
|
||||
|
||||
// init size
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto input_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kIndex0);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
|
@ -71,11 +72,11 @@ bool MinMaxUpdatePerLayerGpuKernelMod::Launch(const std::vector<AddressPtr> &inp
|
|||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
float *output_min = GetDeviceAddress<float>(outputs, 0);
|
||||
float *output_max = GetDeviceAddress<float>(outputs, 1);
|
||||
float *input = GetDeviceAddress<float>(inputs, 0);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, 1);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, 2);
|
||||
float *output_min = GetDeviceAddress<float>(outputs, kIndex0);
|
||||
float *output_max = GetDeviceAddress<float>(outputs, kIndex1);
|
||||
float *input = GetDeviceAddress<float>(inputs, kIndex0);
|
||||
float *input_min = GetDeviceAddress<float>(inputs, kIndex1);
|
||||
float *input_max = GetDeviceAddress<float>(inputs, kIndex2);
|
||||
|
||||
CalMinMaxPerLayer(input, input_min, input_max, output_min, output_max, quant_num_, ema_decay_, ema_,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
/**
|
||||
* Copyright 2023 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_QUANT_QUANT_OP_CONST_H_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_QUANT_QUANT_OP_CONST_H_
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr int kIndex0 = 0;
|
||||
constexpr int kIndex1 = 1;
|
||||
constexpr int kIndex2 = 2;
|
||||
constexpr int kIndex3 = 3;
|
||||
constexpr int kIndex4 = 4;
|
||||
constexpr int kIndex5 = 5;
|
||||
constexpr int kIndex6 = 6;
|
||||
constexpr int kIndex7 = 7;
|
||||
|
||||
constexpr int kSize0 = 0;
|
||||
constexpr int kSize1 = 1;
|
||||
constexpr int kSize2 = 2;
|
||||
constexpr int kSize3 = 3;
|
||||
constexpr int kSize4 = 4;
|
||||
constexpr int kSize5 = 5;
|
||||
constexpr int kSize6 = 6;
|
||||
constexpr int kSize7 = 7;
|
||||
|
||||
constexpr int kMinQuantBit = 2;
|
||||
constexpr int kMaxQuantBit = 16;
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_QUANT_QUANT_OP_CONST_H_
|
Loading…
Reference in New Issue