forked from mindspore-Ecosystem/mindspore
!363 clear the warning scan by package
Merge pull request !363 from SanjayChan/labao
This commit is contained in:
commit
58b013c319
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
*
|
*
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
* you may not use this file except in compliance with the License.
|
* you may not use this file except in compliance with the License.
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
*
|
*
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
* you may not use this file except in compliance with the License.
|
* you may not use this file except in compliance with the License.
|
||||||
|
|
|
@ -19,7 +19,6 @@
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
|
|
||||||
DropoutGpuFwdKernel::DropoutGpuFwdKernel()
|
DropoutGpuFwdKernel::DropoutGpuFwdKernel()
|
||||||
: cudnn_handle_(nullptr),
|
: cudnn_handle_(nullptr),
|
||||||
is_null_input_(false),
|
is_null_input_(false),
|
||||||
|
|
|
@ -18,7 +18,6 @@
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
|
|
||||||
MS_REG_GPU_KERNEL_ONE(BatchNormFold2,
|
MS_REG_GPU_KERNEL_ONE(BatchNormFold2,
|
||||||
KernelAttr()
|
KernelAttr()
|
||||||
.AddInputAttr(kNumberTypeFloat32)
|
.AddInputAttr(kNumberTypeFloat32)
|
||||||
|
|
|
@ -132,7 +132,6 @@ class BatchNormFold2GpuKernel : public GpuKernel {
|
||||||
std::vector<size_t> output_size_list_;
|
std::vector<size_t> output_size_list_;
|
||||||
std::vector<size_t> workspace_size_list_;
|
std::vector<size_t> workspace_size_list_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,6 @@
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
|
|
||||||
MS_REG_GPU_KERNEL_ONE(BatchNormFold2Grad,
|
MS_REG_GPU_KERNEL_ONE(BatchNormFold2Grad,
|
||||||
KernelAttr()
|
KernelAttr()
|
||||||
.AddInputAttr(kNumberTypeFloat32)
|
.AddInputAttr(kNumberTypeFloat32)
|
||||||
|
|
|
@ -18,7 +18,6 @@
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
|
|
||||||
MS_REG_GPU_KERNEL_ONE(BatchNormFold,
|
MS_REG_GPU_KERNEL_ONE(BatchNormFold,
|
||||||
KernelAttr()
|
KernelAttr()
|
||||||
.AddInputAttr(kNumberTypeFloat32)
|
.AddInputAttr(kNumberTypeFloat32)
|
||||||
|
|
|
@ -54,7 +54,6 @@ class CorrectionMulGpuKernel : public GpuKernel {
|
||||||
}
|
}
|
||||||
|
|
||||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||||
|
|
||||||
if (input_shape.size() != 4) {
|
if (input_shape.size() != 4) {
|
||||||
MS_LOG(ERROR) << "CorrectionMulGpuKernel input shape needs (N,C,H,W).";
|
MS_LOG(ERROR) << "CorrectionMulGpuKernel input shape needs (N,C,H,W).";
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -19,7 +19,6 @@
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
|
|
||||||
MS_REG_GPU_KERNEL_ONE(CorrectionMulGrad,
|
MS_REG_GPU_KERNEL_ONE(CorrectionMulGrad,
|
||||||
KernelAttr()
|
KernelAttr()
|
||||||
.AddInputAttr(kNumberTypeFloat32)
|
.AddInputAttr(kNumberTypeFloat32)
|
||||||
|
|
|
@ -61,7 +61,6 @@ class CorrectionMulGradGpuKernel : public GpuKernel {
|
||||||
}
|
}
|
||||||
|
|
||||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||||
|
|
||||||
if (input_shape.size() != 4) {
|
if (input_shape.size() != 4) {
|
||||||
MS_LOG(ERROR) << "CorrectionMulGradGpuKernel input shape needs (N,C,H,W).";
|
MS_LOG(ERROR) << "CorrectionMulGradGpuKernel input shape needs (N,C,H,W).";
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -114,6 +114,36 @@ void FakeQuantPerChannelGpuKernel::InitSizeLists() {
|
||||||
workspace_size_list_.push_back(workspace_size_);
|
workspace_size_list_.push_back(workspace_size_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void FakeQuantPerChannelGpuKernel::CalFakeQuantizeForTraining(float *input, float *output, float *input_min,
|
||||||
|
float *input_max, float *d_nudge_min, float *d_nudge_max,
|
||||||
|
float *d_scale, uintptr_t stream_ptr) {
|
||||||
|
// calculate the input min and max according by the parameter ema and ema_decay.
|
||||||
|
CalMinMaxPerChannel(input, input_min, input_max, input_size_ / sizeof(float), channel_out_, ema_decay_, ema_,
|
||||||
|
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||||
|
// control flow for quant_delay
|
||||||
|
if (global_step_ >= quant_delay_) {
|
||||||
|
// real launch
|
||||||
|
CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
|
||||||
|
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||||
|
CalFakeQuantizePerChannel(input, output, input_size_ / sizeof(float), channel_out_, d_nudge_min, d_nudge_max,
|
||||||
|
d_scale, symmetric_, reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||||
|
} else {
|
||||||
|
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpy(output, input, input_size_, cudaMemcpyDeviceToDevice),
|
||||||
|
"Copy gpu memory failed.");
|
||||||
|
}
|
||||||
|
global_step_++;
|
||||||
|
}
|
||||||
|
|
||||||
|
void FakeQuantPerChannelGpuKernel::CalFakeQuantizeForInfer(float *input, float *output, float *input_min,
|
||||||
|
float *input_max, float *d_nudge_min, float *d_nudge_max,
|
||||||
|
float *d_scale, uintptr_t stream_ptr) {
|
||||||
|
// real launch
|
||||||
|
CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
|
||||||
|
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||||
|
CalFakeQuantizePerChannel(input, output, input_size_ / sizeof(float), channel_out_, d_nudge_min, d_nudge_max, d_scale,
|
||||||
|
symmetric_, reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||||
|
}
|
||||||
|
|
||||||
bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
|
bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
|
||||||
const std::vector<AddressPtr> &workspace,
|
const std::vector<AddressPtr> &workspace,
|
||||||
const std::vector<AddressPtr> &outputs, uintptr_t stream_ptr) {
|
const std::vector<AddressPtr> &outputs, uintptr_t stream_ptr) {
|
||||||
|
@ -126,11 +156,8 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
|
||||||
if (input == nullptr) {
|
if (input == nullptr) {
|
||||||
MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input is null.";
|
MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input is null.";
|
||||||
}
|
}
|
||||||
if (input_min == nullptr) {
|
if (input_min == nullptr || input_max == nullptr) {
|
||||||
MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input min is null.";
|
MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input min or max is null.";
|
||||||
}
|
|
||||||
if (input_max == nullptr) {
|
|
||||||
MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input max is null.";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Allocate space for device copies
|
// Allocate space for device copies
|
||||||
|
@ -143,30 +170,11 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
|
||||||
"Malloc gpu memory failed");
|
"Malloc gpu memory failed");
|
||||||
CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_nudge_max), sizeof(float) * channel_out_),
|
CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_nudge_max), sizeof(float) * channel_out_),
|
||||||
"Malloc gpu memory failed");
|
"Malloc gpu memory failed");
|
||||||
int total_size = input_size_ / sizeof(float);
|
|
||||||
bool symmetric = false;
|
|
||||||
if (training_) {
|
if (training_) {
|
||||||
// calculate the input min and max according by the parameter ema and ema_decay.
|
CalFakeQuantizeForTraining(input, output, input_min, input_max, d_nudge_min, d_nudge_max, d_scale, stream_ptr);
|
||||||
CalMinMaxPerChannel(input, input_min, input_max, total_size, channel_out_, ema_decay_, ema_,
|
|
||||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
|
||||||
// control flow for quant_delay
|
|
||||||
if (global_step_ >= quant_delay_) {
|
|
||||||
// real launch
|
|
||||||
CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
|
|
||||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
|
||||||
CalFakeQuantizePerChannel(input, output, total_size, channel_out_, d_nudge_min, d_nudge_max, d_scale, symmetric,
|
|
||||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
|
||||||
} else {
|
} else {
|
||||||
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpy(output, input, input_size_, cudaMemcpyDeviceToDevice),
|
CalFakeQuantizeForInfer(input, output, input_min, input_max, d_nudge_min, d_nudge_max, d_scale, stream_ptr);
|
||||||
"Copy gpu memory failed.");
|
|
||||||
}
|
|
||||||
global_step_++;
|
|
||||||
} else {
|
|
||||||
// real launch
|
|
||||||
CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
|
|
||||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
|
||||||
CalFakeQuantizePerChannel(input, output, total_size, channel_out_, d_nudge_min, d_nudge_max, d_scale, symmetric,
|
|
||||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cleanup
|
// Cleanup
|
||||||
|
|
|
@ -39,6 +39,11 @@ class FakeQuantPerChannelGpuKernel : public GpuKernel {
|
||||||
void InitSizeLists() override;
|
void InitSizeLists() override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void CalFakeQuantizeForTraining(float *input, float *output, float *input_min, float *input_max, float *d_nudge_min,
|
||||||
|
float *d_nudge_max, float *d_scale, uintptr_t stream_ptr);
|
||||||
|
void CalFakeQuantizeForInfer(float *input, float *output, float *input_min, float *input_max, float *d_nudge_min,
|
||||||
|
float *d_nudge_max, float *d_scale, uintptr_t stream_ptr);
|
||||||
|
|
||||||
size_t input_size_;
|
size_t input_size_;
|
||||||
size_t min_size_;
|
size_t min_size_;
|
||||||
size_t max_size_;
|
size_t max_size_;
|
||||||
|
|
Loading…
Reference in New Issue