add lsq quantization method

2021-04-17 09:49:16 +08:00 · 2021-04-17 09:49:16 +08:00 · 90d5d5dab3
parent c04e8fce4d
commit 90d5d5dab3
42 changed files with 4157 additions and 267 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perchannel_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perchannel_impl.cu
@ -0,0 +1,113 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fake_learned_scale_quant_perchannel_impl.cuh"
+#include <thrust/extrema.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/reduce.h>
+#include <thrust/pair.h>
+#include <algorithm>
+#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
+
+__global__ void FakeLearnedScaleQuantPerChannel(float *output, const int size, float *input_alpha,
+                                                float *input_quant, const int channel_num) {
+  int channel_idx = 0;
+  int per_channel_num = size / channel_num;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+    channel_idx = floor(static_cast<double>(i) / static_cast<double>(per_channel_num));
+    // dequantize
+    output[i] = input_quant[i] * input_alpha[channel_idx];
+  }
+  return;
+}
+
+__global__ void FakeLearnedScaleQuantPerChannelGrad(float *grad_input, float *grad_alpha, const float *gradient,
+                                                    const int size, const float *input_div_alpha,
+                                                    const float *input_quant, const bool neg_trunc,
+                                                    const int channel_num) {
+  int channel_idx = 0;
+  int per_channel_num = size / channel_num;
+  float lower_bound = -1.0 * !neg_trunc;
+
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+    float grad_alpha_temp = 0.f;
+    channel_idx = floor(static_cast<double>(i) / static_cast<double>(per_channel_num));
+    if (input_div_alpha[i] > 1.0) {
+      grad_alpha_temp = gradient[i];
+      grad_input[i] = 0;
+    } else if (input_div_alpha[i] < lower_bound) {
+      grad_alpha_temp = -gradient[i];
+      grad_input[i] = 0;
+    } else {
+      grad_input[i] = gradient[i];
+      grad_alpha_temp = (gradient[i] * (input_quant[i] -  input_div_alpha[i]));
+    }
+    MsAtomicAdd(grad_alpha + channel_idx, grad_alpha_temp);
+  }
+  return;
+}
+
+__global__ void LSQNudgePerChannel(const float *input, const int size, float *input_alpha, float *input_quant_max,
+                                   float *input_div_alpha, float *input_quant, const bool neg_trunc,
+                                   const int channel_num) {
+  float input_x;
+  int channel_idx = 0;
+  int per_channel_num = size / channel_num;
+  float lower_bound = -1.0 * !neg_trunc;
+
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+    channel_idx = floor(static_cast<double>(i) / static_cast<double>(per_channel_num));
+    input_x = input[i] / input_alpha[channel_idx];
+    input_div_alpha[i] = input_x;
+    input_x = max(input_x, lower_bound);
+    input_x = min(input_x, 1.0);
+
+    // quantize
+    input_quant[i] = floor(input_x * input_quant_max[0] + 0.5f) / input_quant_max[0];
+  }
+  return;
+}
+
+void CalFakeLearnedScaleQuantPerChannel(float *output, const int size, float *input_alpha, float *input_quant,
+                                        const int channel_num, cudaStream_t cuda_stream) {
+  FakeLearnedScaleQuantPerChannel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(output, size, input_alpha,
+                                                                                     input_quant, channel_num);
+  return;
+}
+
+void CalFakeLearnedScaleQuantPerChannelGrad(float *grad_input, float *grad_alpha, const float *gradient, const int size,
+                                            const float *input_div_alpha, const float *input_quant,
+                                            const bool neg_trunc, const int channel_num, cudaStream_t cuda_stream) {
+  FakeLearnedScaleQuantPerChannelGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(grad_input,
+                                                                                         grad_alpha,
+                                                                                         gradient,
+                                                                                         size,
+                                                                                         input_div_alpha,
+                                                                                         input_quant,
+                                                                                         neg_trunc,
+                                                                                         channel_num);
+  return;
+}
+
+void CalLSQNudgePerChannel(const float *input, const int size, float *input_alpha, float *input_quant_max,
+                           float *input_div_alpha, float *input_quant, const bool neg_trunc, const int channel_num,
+                           cudaStream_t cuda_stream) {
+  LSQNudgePerChannel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(input, size, input_alpha, input_quant_max,
+                                                                        input_div_alpha, input_quant, neg_trunc,
+                                                                        channel_num);
+  return;
+}
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh
@ -0,0 +1,33 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_H_
+
+#include "runtime/device/gpu/cuda_common.h"
+
+void CalLSQNudgePerChannel(const float *input, const int size, float *input_alpha, float *input_quant_max,
+                           float *input_div_alpha, float *input_quant, const bool neg_trunc, const int channel_num,
+                           cudaStream_t cuda_stream);
+
+void CalFakeLearnedScaleQuantPerChannel(float *output, const int size, float *input_alpha, float *input_quant,
+                                        const int channel_num, cudaStream_t cuda_stream);
+
+void CalFakeLearnedScaleQuantPerChannelGrad(float *grad_input, float *grad_alpha, const float *gradient, const int size,
+                                            const float *input_div_alpha, const float *input_quant,
+                                            const bool neg_trunc, const int channel_num, cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERCHANNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perlayer_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perlayer_impl.cu
@ -0,0 +1,97 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fake_learned_scale_quant_perlayer_impl.cuh"
+#include <thrust/extrema.h>
+#include <thrust/device_vector.h>
+#include <thrust/pair.h>
+#include <algorithm>
+#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
+
+__global__ void FakeLearnedScaleQuantPerLayer(float *output, const int size, float *input_alpha,
+                                              float *input_quant) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+    // dequantize
+    output[i] = input_quant[i] * input_alpha[0];
+  }
+  return;
+}
+
+__global__ void FakeLearnedScaleQuantPerLayerGrad(float *grad_input, float *grad_alpha, const float *gradient,
+                                                  const int size, const float *input_div_alpha,
+                                                  const float *input_quant, const bool neg_trunc) {
+  float grad_alpha_temp = 0.f;
+  float lower_bound = -1.0 * !neg_trunc;
+
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+    if (input_div_alpha[i] > 1.0) {
+      grad_alpha_temp += gradient[i];
+      grad_input[i] = 0;
+    } else if (input_div_alpha[i] < lower_bound) {
+      grad_alpha_temp -= gradient[i];
+      grad_input[i] = 0;
+    } else {
+      grad_input[i] = gradient[i];
+      grad_alpha_temp += (gradient[i] * (input_quant[i] -  input_div_alpha[i]));
+    }
+  }
+  MsAtomicAdd(grad_alpha, grad_alpha_temp);
+  return;
+}
+
+__global__ void LSQNudgePerLayer(const float *input, const int size, float *input_alpha, float *input_quant_max,
+                                 float *input_div_alpha, float *input_quant, const bool neg_trunc) {
+  float input_x;
+  float lower_bound = -1.0 * !neg_trunc;
+
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+    input_x = input[i] / input_alpha[0];
+    input_div_alpha[i] = input_x;
+    input_x = max(input_x, lower_bound);
+    input_x = min(input_x, 1.0);
+
+    // quantize
+    input_quant[i] = floor(input_x * input_quant_max[0] + 0.5f) / input_quant_max[0];
+  }
+  return;
+}
+
+void CalFakeLearnedScaleQuantPerLayer(float *output, const int size, float *input_alpha, float *input_quant,
+                                      cudaStream_t cuda_stream) {
+  FakeLearnedScaleQuantPerLayer<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(output, size, input_alpha,
+                                                                                   input_quant);
+  return;
+}
+
+void CalFakeLearnedScaleQuantPerLayerGrad(float *grad_input, float *grad_alpha, const float *gradient, const int size,
+                                          const float *input_div_alpha, const float *input_quant, const bool neg_trunc,
+                                          cudaStream_t cuda_stream) {
+  FakeLearnedScaleQuantPerLayerGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(grad_input,
+                                                                                       grad_alpha,
+                                                                                       gradient,
+                                                                                       size,
+                                                                                       input_div_alpha,
+                                                                                       input_quant,
+                                                                                       neg_trunc);
+  return;
+}
+
+void CalLSQNudgePerLayer(const float *input, const int size, float *input_alpha, float *input_quant_max,
+                         float *input_div_alpha, float *input_quant, const bool neg_trunc, cudaStream_t cuda_stream) {
+  LSQNudgePerLayer<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(input, size, input_alpha, input_quant_max,
+                                                                      input_div_alpha, input_quant, neg_trunc);
+  return;
+}
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh
@ -0,0 +1,32 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERLAYER_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERLAYER_H_
+
+#include "runtime/device/gpu/cuda_common.h"
+
+void CalLSQNudgePerLayer(const float *input, const int size, float *input_alpha, float *input_quant_max,
+                         float *input_div_alpha, float *input_quant, const bool neg_trunc, cudaStream_t cuda_stream);
+
+void CalFakeLearnedScaleQuantPerLayer(float *output, const int size, float *input_alpha, float *input_quant,
+                                      cudaStream_t cuda_stream);
+
+void CalFakeLearnedScaleQuantPerLayerGrad(float *grad_input, float *grad_alpha, const float *gradient, const int size,
+                                          const float *input_div_alpha, const float *input_quant, const bool neg_trunc,
+                                          cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_LEARNED_SCALE_QUANT_PERLAYER_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc
@ -0,0 +1,130 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh"
+#include <thrust/extrema.h>
+#include <thrust/pair.h>
+#include <thrust/device_vector.h>
+#include <cuda_runtime_api.h>
+
+namespace mindspore {
+namespace kernel {
+FakeLearnedScaleQuantPerChannelGpuKernel::FakeLearnedScaleQuantPerChannelGpuKernel()
+    : input_size_(0),
+      quant_num_(1),
+      global_step_(0),
+      quant_delay_(0),
+      training_(false),
+      neg_trunc_(false),
+      num_channels_(0) {}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerChannelGpuKernel::GetInputSizeList() const {
+  return input_size_list_;
+}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerChannelGpuKernel::GetOutputSizeList() const {
+  return output_size_list_;
+}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerChannelGpuKernel::GetWorkspaceSizeList() const {
+  return workspace_size_list_;
+}
+
+bool FakeLearnedScaleQuantPerChannelGpuKernel::Init(const CNodePtr &kernel_node) {
+  kernel_node_ = kernel_node;
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+  if (input_num != 3) {
+    MS_LOG(EXCEPTION) << "Input number is " << input_num
+                      << ", but FakeLearnedScaleQuantPerChannel GpuKernel OP needs 3 Input.";
+  }
+
+  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+  if (output_num != 1) {
+    MS_LOG(EXCEPTION) << "Output number is " << output_num
+                      << ", but FakeLearnedScaleQuantPerChannel GpuKernel OP needs 1 output.";
+  }
+
+  quant_delay_ = static_cast<int>(GetValue<int64_t>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")));
+  training_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("training"));
+  neg_trunc_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
+
+  // init size
+  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  num_channels_ = SizeToInt(input_shape[0]);
+  for (size_t i = 0; i < input_shape.size(); ++i) {
+    quant_num_ *= SizeToInt(input_shape[i]);
+  }
+  input_size_ = sizeof(float) * quant_num_;
+  InitSizeLists();
+  return true;
+}
+
+void FakeLearnedScaleQuantPerChannelGpuKernel::InitSizeLists() {
+  input_size_list_.push_back(input_size_);                    // x
+  input_size_list_.push_back(sizeof(float) * num_channels_);  // alpha
+  input_size_list_.push_back(sizeof(float));                  // quant_max
+  output_size_list_.push_back(input_size_);                   // y
+  workspace_size_list_.push_back(input_size_);                // input_div_alpha
+  workspace_size_list_.push_back(input_size_);                // input_quant
+}
+
+bool FakeLearnedScaleQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
+                                                      const std::vector<AddressPtr> &workspace,
+                                                      const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  float *input = GetDeviceAddress<float>(inputs, 0);
+  float *input_alpha = GetDeviceAddress<float>(inputs, 1);
+  float *input_quant_max = GetDeviceAddress<float>(inputs, 2);
+  float *output = GetDeviceAddress<float>(outputs, 0);
+  float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
+  float *input_quant = GetDeviceAddress<float>(workspace, 1);
+
+  MS_EXCEPTION_IF_NULL(input);
+  MS_EXCEPTION_IF_NULL(input_alpha);
+  MS_EXCEPTION_IF_NULL(input_quant_max);
+  MS_EXCEPTION_IF_NULL(output);
+  MS_EXCEPTION_IF_NULL(input_div_alpha);
+  MS_EXCEPTION_IF_NULL(input_quant);
+
+  if (training_) {
+    // control flow for quant_delay
+    if (global_step_ >= quant_delay_) {
+      // real launch
+      CalLSQNudgePerChannel(input, quant_num_, input_alpha, input_quant_max, input_div_alpha, input_quant, neg_trunc_,
+                            num_channels_, reinterpret_cast<cudaStream_t>(stream_ptr));
+      CalFakeLearnedScaleQuantPerChannel(output, quant_num_, input_alpha, input_quant, num_channels_,
+                                         reinterpret_cast<cudaStream_t>(stream_ptr));
+    } else {
+      CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                                cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice,
+                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
+                                "Copy gpu memory failed");
+    }
+    global_step_++;
+  } else {
+    // real launch
+    CalLSQNudgePerChannel(input, quant_num_, input_alpha, input_quant_max, input_div_alpha, input_quant, neg_trunc_,
+                          num_channels_, reinterpret_cast<cudaStream_t>(stream_ptr));
+    CalFakeLearnedScaleQuantPerChannel(output, quant_num_, input_alpha, input_quant, num_channels_,
+                                       reinterpret_cast<cudaStream_t>(stream_ptr));
+  }
+
+  return true;
+}
+
+MS_REG_GPU_KERNEL(FakeLearnedScaleQuantPerChannel, FakeLearnedScaleQuantPerChannelGpuKernel)
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_gpu_kernel.h
@ -0,0 +1,57 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PER_CHANNEL_GPUKERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PER_CHANNEL_GPUKERNEL_H_
+
+#include <vector>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+class FakeLearnedScaleQuantPerChannelGpuKernel : public GpuKernel {
+ public:
+  FakeLearnedScaleQuantPerChannelGpuKernel();
+  ~FakeLearnedScaleQuantPerChannelGpuKernel() = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override;
+  const std::vector<size_t> &GetOutputSizeList() const override;
+  const std::vector<size_t> &GetWorkspaceSizeList() const override;
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+  bool Init(const CNodePtr &kernel) override;
+
+ protected:
+  void InitSizeLists() override;
+
+ private:
+  size_t input_size_;
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+
+  int quant_num_;
+  int global_step_;
+  int quant_delay_;
+  bool training_;
+  bool neg_trunc_;
+  int num_channels_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PER_CHANNEL_GPUKERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.cc
@ -0,0 +1,135 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perchannel_impl.cuh"
+
+namespace mindspore {
+namespace kernel {
+FakeLearnedScaleQuantPerChannelGradGpuKernel::FakeLearnedScaleQuantPerChannelGradGpuKernel()
+    : input_size_(0),
+      workspace_size_(0),
+      quant_num_(1),
+      quant_delay_(0),
+      global_step_(0),
+      neg_trunc_(false),
+      num_channels_(0) {}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerChannelGradGpuKernel::GetInputSizeList() const {
+  return input_size_list_;
+}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerChannelGradGpuKernel::GetOutputSizeList() const {
+  return output_size_list_;
+}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerChannelGradGpuKernel::GetWorkspaceSizeList() const {
+  return workspace_size_list_;
+}
+
+bool FakeLearnedScaleQuantPerChannelGradGpuKernel::Init(const CNodePtr &kernel_node) {
+  kernel_node_ = kernel_node;
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+
+  if (input_num != 4) {
+    MS_LOG(EXCEPTION) << "Input number is " << input_num
+                      << ", but FakeLearnedScaleQuantPerChannelGrad GpuKernel OP needs 4 input.";
+  }
+
+  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+  if (output_num != 2) {
+    MS_LOG(EXCEPTION) << "Output number is " << output_num
+                      << ", but FakeLearnedScaleQuantPerChannelGrad GpuKernel OP needs 2 output.";
+  }
+
+  quant_delay_ = static_cast<int>(GetValue<int64_t>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")));
+  if (quant_delay_ < 0) {
+    MS_LOG(EXCEPTION) << "Attr \'quant_delay_\' " << quant_delay_ << " is less than 0, require larger than 0.";
+  }
+
+  neg_trunc_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
+
+  // init size
+  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  num_channels_ = SizeToInt(input_shape[0]);
+  for (size_t i = 0; i < input_shape.size(); ++i) {
+    quant_num_ *= SizeToInt(input_shape[i]);
+  }
+  input_size_ = sizeof(float) * quant_num_;
+  InitSizeLists();
+  return true;
+}
+
+void FakeLearnedScaleQuantPerChannelGradGpuKernel::InitSizeLists() {
+  input_size_list_.push_back(input_size_);                     // gradient
+  input_size_list_.push_back(input_size_);                     // input
+  input_size_list_.push_back(sizeof(float) * num_channels_);   // alpha
+  input_size_list_.push_back(sizeof(float));                   // quant_max
+  output_size_list_.push_back(input_size_);                    //  grad_input
+  output_size_list_.push_back(sizeof(float) * num_channels_);  // grad_alpha
+  workspace_size_list_.push_back(input_size_);                 // input_div_alpha
+  workspace_size_list_.push_back(input_size_);                 // input_quant
+}
+
+bool FakeLearnedScaleQuantPerChannelGradGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
+                                                          const std::vector<AddressPtr> &workspace,
+                                                          const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  float *grad_input = GetDeviceAddress<float>(outputs, 0);
+  float *grad_alpha = GetDeviceAddress<float>(outputs, 1);
+  float *gradient = GetDeviceAddress<float>(inputs, 0);
+  float *input = GetDeviceAddress<float>(inputs, 1);
+  float *input_alpha = GetDeviceAddress<float>(inputs, 2);
+  float *input_quant_max = GetDeviceAddress<float>(inputs, 3);
+  float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
+  float *input_quant = GetDeviceAddress<float>(workspace, 1);
+
+  MS_EXCEPTION_IF_NULL(grad_input);
+  MS_EXCEPTION_IF_NULL(grad_alpha);
+  MS_EXCEPTION_IF_NULL(gradient);
+  MS_EXCEPTION_IF_NULL(input);
+  MS_EXCEPTION_IF_NULL(input_alpha);
+  MS_EXCEPTION_IF_NULL(input_quant_max);
+  MS_EXCEPTION_IF_NULL(input_div_alpha);
+  MS_EXCEPTION_IF_NULL(input_quant);
+  const int kChannelLen = num_channels_;
+  float alpha_no_grad[kChannelLen];
+  memset_s(alpha_no_grad, kChannelLen * sizeof(float), 0, kChannelLen * sizeof(float));
+
+  if (global_step_ >= quant_delay_) {
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(grad_alpha, alpha_no_grad, sizeof(float) * num_channels_,
+                                              cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "Copy gpu memory failed");
+    CalLSQNudgePerChannel(input, quant_num_, input_alpha, input_quant_max, input_div_alpha, input_quant, neg_trunc_,
+                          num_channels_, reinterpret_cast<cudaStream_t>(stream_ptr));
+    CalFakeLearnedScaleQuantPerChannelGrad(grad_input, grad_alpha, gradient, quant_num_, input_div_alpha, input_quant,
+                                           neg_trunc_, num_channels_, reinterpret_cast<cudaStream_t>(stream_ptr));
+  } else {
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(grad_alpha, alpha_no_grad, sizeof(float) * num_channels_,
+                                              cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "Copy gpu memory failed");
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(grad_input, gradient, input_size_, cudaMemcpyDeviceToDevice,
+                                              reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "Copy gpu memory failed");
+  }
+  global_step_++;
+  return true;
+}
+
+MS_REG_GPU_KERNEL(FakeLearnedScaleQuantPerChannelGrad, FakeLearnedScaleQuantPerChannelGradGpuKernel)
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_grad_gpu_kernel.h
@ -0,0 +1,57 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PER_CHANNEL_GRAD_GPUKERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PER_CHANNEL_GRAD_GPUKERNEL_H_
+
+#include <vector>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+class FakeLearnedScaleQuantPerChannelGradGpuKernel : public GpuKernel {
+ public:
+  FakeLearnedScaleQuantPerChannelGradGpuKernel();
+  ~FakeLearnedScaleQuantPerChannelGradGpuKernel() = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override;
+  const std::vector<size_t> &GetOutputSizeList() const override;
+  const std::vector<size_t> &GetWorkspaceSizeList() const override;
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+  bool Init(const CNodePtr &kernel_node) override;
+
+ protected:
+  void InitSizeLists() override;
+
+ private:
+  size_t input_size_;
+  size_t workspace_size_;
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+
+  int quant_num_;
+  int quant_delay_;
+  int global_step_;
+  bool neg_trunc_;
+  int num_channels_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PER_CHANNEL_GRAD_GPUKERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_gpu_kernel.cc
@ -0,0 +1,121 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh"
+#include <thrust/extrema.h>
+#include <thrust/pair.h>
+#include <thrust/device_vector.h>
+#include <cuda_runtime_api.h>
+
+namespace mindspore {
+namespace kernel {
+FakeLearnedScaleQuantPerLayerGpuKernel::FakeLearnedScaleQuantPerLayerGpuKernel()
+    : input_size_(0), quant_num_(1), global_step_(0), quant_delay_(0), training_(false), neg_trunc_(false) {}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerLayerGpuKernel::GetInputSizeList() const { return input_size_list_; }
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerLayerGpuKernel::GetOutputSizeList() const {
+  return output_size_list_;
+}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerLayerGpuKernel::GetWorkspaceSizeList() const {
+  return workspace_size_list_;
+}
+
+bool FakeLearnedScaleQuantPerLayerGpuKernel::Init(const CNodePtr &kernel_node) {
+  kernel_node_ = kernel_node;
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+  if (input_num != 3) {
+    MS_LOG(EXCEPTION) << "Input number is " << input_num
+                      << ", but FakeLearnedScaleQuantPerLayer GpuKernel OP needs 3 Input.";
+  }
+
+  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+  if (output_num != 1) {
+    MS_LOG(EXCEPTION) << "Output number is " << output_num
+                      << ", but FakeLearnedScaleQuantPerLayer GpuKernel OP needs 1 output.";
+  }
+
+  quant_delay_ = static_cast<int>(GetValue<int64_t>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")));
+  training_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("training"));
+  neg_trunc_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
+
+  // init size
+  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  for (size_t i = 0; i < input_shape.size(); ++i) {
+    quant_num_ *= SizeToInt(input_shape[i]);
+  }
+  input_size_ = sizeof(float) * quant_num_;
+  InitSizeLists();
+  return true;
+}
+
+void FakeLearnedScaleQuantPerLayerGpuKernel::InitSizeLists() {
+  input_size_list_.push_back(input_size_);      // x
+  input_size_list_.push_back(sizeof(float));    // alpha
+  input_size_list_.push_back(sizeof(float));    // quant_max
+  output_size_list_.push_back(input_size_);     // y
+  workspace_size_list_.push_back(input_size_);  // input_div_alpha
+  workspace_size_list_.push_back(input_size_);  // input_quant
+}
+
+bool FakeLearnedScaleQuantPerLayerGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
+                                                    const std::vector<AddressPtr> &workspace,
+                                                    const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  float *input = GetDeviceAddress<float>(inputs, 0);
+  float *input_alpha = GetDeviceAddress<float>(inputs, 1);
+  float *input_quant_max = GetDeviceAddress<float>(inputs, 2);
+  float *output = GetDeviceAddress<float>(outputs, 0);
+  float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
+  float *input_quant = GetDeviceAddress<float>(workspace, 1);
+
+  MS_EXCEPTION_IF_NULL(input);
+  MS_EXCEPTION_IF_NULL(input_alpha);
+  MS_EXCEPTION_IF_NULL(input_quant_max);
+  MS_EXCEPTION_IF_NULL(output);
+  MS_EXCEPTION_IF_NULL(input_div_alpha);
+  MS_EXCEPTION_IF_NULL(input_quant);
+
+  if (training_) {
+    // control flow for quant_delay
+    if (global_step_ >= quant_delay_) {
+      // real launch
+      CalLSQNudgePerLayer(input, quant_num_, input_alpha, input_quant_max, input_div_alpha, input_quant, neg_trunc_,
+                          reinterpret_cast<cudaStream_t>(stream_ptr));
+      CalFakeLearnedScaleQuantPerLayer(output, quant_num_, input_alpha, input_quant,
+                                       reinterpret_cast<cudaStream_t>(stream_ptr));
+    } else {
+      CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                                cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice,
+                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
+                                "Copy gpu memory failed");
+    }
+    global_step_++;
+  } else {
+    // real launch
+    CalLSQNudgePerLayer(input, quant_num_, input_alpha, input_quant_max, input_div_alpha, input_quant, neg_trunc_,
+                        reinterpret_cast<cudaStream_t>(stream_ptr));
+    CalFakeLearnedScaleQuantPerLayer(output, quant_num_, input_alpha, input_quant,
+                                     reinterpret_cast<cudaStream_t>(stream_ptr));
+  }
+
+  return true;
+}
+
+MS_REG_GPU_KERNEL(FakeLearnedScaleQuantPerLayer, FakeLearnedScaleQuantPerLayerGpuKernel)
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_gpu_kernel.h
@ -0,0 +1,56 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PERLAYER_GPUKERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PERLAYER_GPUKERNEL_H_
+
+#include <vector>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+class FakeLearnedScaleQuantPerLayerGpuKernel : public GpuKernel {
+ public:
+  FakeLearnedScaleQuantPerLayerGpuKernel();
+  ~FakeLearnedScaleQuantPerLayerGpuKernel() = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override;
+  const std::vector<size_t> &GetOutputSizeList() const override;
+  const std::vector<size_t> &GetWorkspaceSizeList() const override;
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+  bool Init(const CNodePtr &kernel) override;
+
+ protected:
+  void InitSizeLists() override;
+
+ private:
+  size_t input_size_;
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+
+  int quant_num_;
+  int global_step_;
+  int quant_delay_;
+  bool training_;
+  bool neg_trunc_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PERLAYER_GPUKERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.cc
@ -0,0 +1,130 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/fake_learned_scale_quant_perlayer_impl.cuh"
+
+namespace mindspore {
+namespace kernel {
+FakeLearnedScaleQuantPerLayerGradGpuKernel::FakeLearnedScaleQuantPerLayerGradGpuKernel()
+    : input_size_(0), workspace_size_(0), quant_num_(1), quant_delay_(0), global_step_(0), neg_trunc_(false) {}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerLayerGradGpuKernel::GetInputSizeList() const {
+  return input_size_list_;
+}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerLayerGradGpuKernel::GetOutputSizeList() const {
+  return output_size_list_;
+}
+
+const std::vector<size_t> &FakeLearnedScaleQuantPerLayerGradGpuKernel::GetWorkspaceSizeList() const {
+  return workspace_size_list_;
+}
+
+bool FakeLearnedScaleQuantPerLayerGradGpuKernel::Init(const CNodePtr &kernel_node) {
+  kernel_node_ = kernel_node;
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+
+  if (input_num != 4) {
+    MS_LOG(EXCEPTION) << "Input number is " << input_num
+                      << ", but FakeLearnedScaleQuantPerLayerGrad GpuKernel OP needs 4 input.";
+  }
+
+  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+  if (output_num != 2) {
+    MS_LOG(EXCEPTION) << "Output number is " << output_num
+                      << ", but FakeLearnedScaleQuantPerLayerGrad GpuKernel OP needs 2 output.";
+  }
+
+  quant_delay_ = static_cast<int>(GetValue<int64_t>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")));
+  if (quant_delay_ < 0) {
+    MS_LOG(EXCEPTION) << "Attr \'quant_delay_\' " << quant_delay_ << " is less than 0, require larger than 0.";
+  }
+
+  neg_trunc_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("neg_trunc"));
+
+  // init size
+  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  for (size_t i = 0; i < input_shape.size(); ++i) {
+    quant_num_ *= SizeToInt(input_shape[i]);
+  }
+  input_size_ = sizeof(float);
+  for (size_t i = 0; i < input_shape.size(); i++) {
+    input_size_ *= input_shape[i];
+  }
+  InitSizeLists();
+  return true;
+}
+
+void FakeLearnedScaleQuantPerLayerGradGpuKernel::InitSizeLists() {
+  input_size_list_.push_back(input_size_);      // gradient
+  input_size_list_.push_back(input_size_);      // input
+  input_size_list_.push_back(sizeof(float));    // alpha
+  input_size_list_.push_back(sizeof(float));    // quant_max
+  output_size_list_.push_back(input_size_);     //  grad_input
+  output_size_list_.push_back(sizeof(float));   // grad_alpha
+  workspace_size_list_.push_back(input_size_);  // input_div_alpha
+  workspace_size_list_.push_back(input_size_);  // input_quant
+}
+
+bool FakeLearnedScaleQuantPerLayerGradGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
+                                                        const std::vector<AddressPtr> &workspace,
+                                                        const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  float *grad_input = GetDeviceAddress<float>(outputs, 0);
+  float *grad_alpha = GetDeviceAddress<float>(outputs, 1);
+  float *gradient = GetDeviceAddress<float>(inputs, 0);
+  float *input = GetDeviceAddress<float>(inputs, 1);
+  float *input_alpha = GetDeviceAddress<float>(inputs, 2);
+  float *input_quant_max = GetDeviceAddress<float>(inputs, 3);
+  float *input_div_alpha = GetDeviceAddress<float>(workspace, 0);
+  float *input_quant = GetDeviceAddress<float>(workspace, 1);
+
+  MS_EXCEPTION_IF_NULL(grad_input);
+  MS_EXCEPTION_IF_NULL(grad_alpha);
+  MS_EXCEPTION_IF_NULL(gradient);
+  MS_EXCEPTION_IF_NULL(input);
+  MS_EXCEPTION_IF_NULL(input_alpha);
+  MS_EXCEPTION_IF_NULL(input_quant_max);
+  MS_EXCEPTION_IF_NULL(input_div_alpha);
+  MS_EXCEPTION_IF_NULL(input_quant);
+
+  const float alpha_no_grad[1] = {0.f};
+
+  if (global_step_ >= quant_delay_) {
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(grad_alpha, alpha_no_grad, sizeof(float), cudaMemcpyHostToDevice,
+                                              reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "Copy gpu memory failed");
+    CalLSQNudgePerLayer(input, quant_num_, input_alpha, input_quant_max, input_div_alpha, input_quant, neg_trunc_,
+                        reinterpret_cast<cudaStream_t>(stream_ptr));
+    CalFakeLearnedScaleQuantPerLayerGrad(grad_input, grad_alpha, gradient, quant_num_, input_div_alpha, input_quant,
+                                         neg_trunc_, reinterpret_cast<cudaStream_t>(stream_ptr));
+  } else {
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(grad_alpha, alpha_no_grad, sizeof(float), cudaMemcpyHostToDevice,
+                                              reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "Copy gpu memory failed");
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(grad_input, gradient, input_size_, cudaMemcpyDeviceToDevice,
+                                              reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "Copy gpu memory failed");
+  }
+  global_step_++;
+  return true;
+}
+
+MS_REG_GPU_KERNEL(FakeLearnedScaleQuantPerLayerGrad, FakeLearnedScaleQuantPerLayerGradGpuKernel)
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perlayer_grad_gpu_kernel.h
@ -0,0 +1,56 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PERLAYER_GRAD_GPUKERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PERLAYER_GRAD_GPUKERNEL_H_
+
+#include <vector>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+class FakeLearnedScaleQuantPerLayerGradGpuKernel : public GpuKernel {
+ public:
+  FakeLearnedScaleQuantPerLayerGradGpuKernel();
+  ~FakeLearnedScaleQuantPerLayerGradGpuKernel() = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override;
+  const std::vector<size_t> &GetOutputSizeList() const override;
+  const std::vector<size_t> &GetWorkspaceSizeList() const override;
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+  bool Init(const CNodePtr &kernel_node) override;
+
+ protected:
+  void InitSizeLists() override;
+
+ private:
+  size_t input_size_;
+  size_t workspace_size_;
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+
+  int quant_num_;
+  int quant_delay_;
+  int global_step_;
+  bool neg_trunc_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_FAKE_LEARNED_SCALE_QUANT_PERLAYER_GRAD_GPUKERNEL_H_
--- a/mindspore/ccsrc/backend/optimizer/ascend/mindir/fake_learned_scale_quant_grad_unify_mindir.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/mindir/fake_learned_scale_quant_grad_unify_mindir.cc
@ -0,0 +1,218 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "backend/optimizer/ascend/mindir/fake_learned_scale_quant_grad_unify_mindir.h"
+
+#include <vector>
+#include <memory>
+
+#include "utils/utils.h"
+#include "utils/ms_context.h"
+#include "backend/optimizer/common/helper.h"
+#include "runtime/device/kernel_info.h"
+#include "backend/session/anf_runtime_algorithm.h"
+#include "utils/trace_base.h"
+
+namespace mindspore {
+namespace opt {
+namespace {
+void CreateOutputsOfLSQPerLayerGradD(const FuncGraphPtr &graph, const CNodePtr &lsq_perlayer_grad_node,
+                                     std::vector<AnfNodePtr> *lsq_perlayer_grad_d_outputs) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(lsq_perlayer_grad_node);
+  const auto &lsq_perlayer_grad_inputs = lsq_perlayer_grad_node->inputs();
+  if (lsq_perlayer_grad_inputs.size() < kFakeLearnedScaleQuantGradInputNum) {
+    MS_LOG(EXCEPTION) << "lsq_perlayer_grad_node has wrong inputs size."
+                      << " trace: " << trace::DumpSourceLines(lsq_perlayer_grad_node);
+  }
+  std::vector<AnfNodePtr> lsq_perlayer_grad_d_inputs = {
+    NewValueNode(std::make_shared<Primitive>(kFakeLearnedScaleQuantPerLayerGradDOpName)), lsq_perlayer_grad_inputs[1],
+    lsq_perlayer_grad_inputs[2], lsq_perlayer_grad_inputs[3], lsq_perlayer_grad_inputs[4]};
+  auto lsq_perlayer_grad_d = graph->NewCNode(lsq_perlayer_grad_d_inputs);
+  MS_EXCEPTION_IF_NULL(lsq_perlayer_grad_d);
+  lsq_perlayer_grad_d->set_scope(lsq_perlayer_grad_node->scope());
+
+  auto types = {AnfAlgo::GetOutputInferDataType(lsq_perlayer_grad_node, 0),
+                AnfAlgo::GetOutputInferDataType(lsq_perlayer_grad_node, 0)};
+  auto shapes = {AnfAlgo::GetOutputInferShape(lsq_perlayer_grad_node, 0),
+                 AnfAlgo::GetOutputInferShape(lsq_perlayer_grad_node, 0)};
+  AnfAlgo::SetOutputInferTypeAndShape(types, shapes, lsq_perlayer_grad_d.get());
+
+  AnfAlgo::CopyNodeAttr(kAttrNeg_trunc, lsq_perlayer_grad_node, lsq_perlayer_grad_d);
+  CreateMultipleOutputsOfAnfNode(graph, lsq_perlayer_grad_d, kFakeLearnedScaleQuantGradDOutputNum,
+                                 lsq_perlayer_grad_d_outputs);
+}
+
+void CreateOutputsOfLSQPerLayerReduceGrad(const FuncGraphPtr &graph, const CNodePtr &lsq_perlayer_grad_node,
+                                          const std::vector<AnfNodePtr> &lsq_perlayer_grad_d_outputs,
+                                          std::vector<AnfNodePtr> *lsq_perlayer_reduce_grad_outputs) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(lsq_perlayer_grad_node);
+  MS_EXCEPTION_IF_NULL(lsq_perlayer_reduce_grad_outputs);
+  const auto &lsq_perlayer_grad_inputs = lsq_perlayer_grad_node->inputs();
+  if (lsq_perlayer_grad_inputs.size() < kFakeLearnedScaleQuantGradInputNum) {
+    MS_LOG(EXCEPTION) << "lsq_perlayer_grad_node has wrong inputs size"
+                      << " trace: " << trace::DumpSourceLines(lsq_perlayer_grad_node);
+  }
+  if (lsq_perlayer_grad_d_outputs.size() != kFakeLearnedScaleQuantGradDOutputNum) {
+    MS_LOG(EXCEPTION) << "lsq_perlayer_grad_d_outputs has wrong size"
+                      << " trace: " << trace::DumpSourceLines(lsq_perlayer_grad_node);
+  }
+  std::vector<AnfNodePtr> lsq_perlayer_reduce_grad_inputs = {
+    NewValueNode(std::make_shared<Primitive>(kFakeLearnedScaleQuantPerLayerGradDReduceOpName)),
+    lsq_perlayer_grad_d_outputs[1]};
+  auto lsq_perlayer_reduce_grad = graph->NewCNode(lsq_perlayer_reduce_grad_inputs);
+  MS_EXCEPTION_IF_NULL(lsq_perlayer_reduce_grad);
+  lsq_perlayer_reduce_grad->set_scope(lsq_perlayer_grad_node->scope());
+
+  auto types = {AnfAlgo::GetOutputInferDataType(lsq_perlayer_grad_node, 1)};
+  auto shapes = {AnfAlgo::GetOutputInferShape(lsq_perlayer_grad_node, 1)};
+  AnfAlgo::SetOutputInferTypeAndShape(types, shapes, lsq_perlayer_reduce_grad.get());
+
+  (*lsq_perlayer_reduce_grad_outputs).push_back(lsq_perlayer_reduce_grad);
+}
+
+void CreateOutputsOfLSQPerChannelGradD(const FuncGraphPtr &graph, const CNodePtr &lsq_perchannel_grad_node,
+                                       std::vector<AnfNodePtr> *lsq_perchannel_grad_d_outputs) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(lsq_perchannel_grad_node);
+  const auto &lsq_perchannel_grad_inputs = lsq_perchannel_grad_node->inputs();
+  if (lsq_perchannel_grad_inputs.size() < kFakeLearnedScaleQuantGradInputNum) {
+    MS_LOG(EXCEPTION) << "lsq_perchannel_grad_node has wrong inputs size."
+                      << " trace: " << trace::DumpSourceLines(lsq_perchannel_grad_node);
+  }
+  std::vector<AnfNodePtr> lsq_perchannel_grad_d_inputs = {
+    NewValueNode(std::make_shared<Primitive>(kFakeLearnedScaleQuantPerChannelGradDOpName)),
+    lsq_perchannel_grad_inputs[1], lsq_perchannel_grad_inputs[2], lsq_perchannel_grad_inputs[3],
+    lsq_perchannel_grad_inputs[4]};
+  auto lsq_perchannel_grad_d = graph->NewCNode(lsq_perchannel_grad_d_inputs);
+  MS_EXCEPTION_IF_NULL(lsq_perchannel_grad_d);
+  lsq_perchannel_grad_d->set_scope(lsq_perchannel_grad_node->scope());
+
+  auto types = {AnfAlgo::GetOutputInferDataType(lsq_perchannel_grad_node, 0),
+                AnfAlgo::GetOutputInferDataType(lsq_perchannel_grad_node, 0)};
+  auto shapes = {AnfAlgo::GetOutputInferShape(lsq_perchannel_grad_node, 0),
+                 AnfAlgo::GetOutputInferShape(lsq_perchannel_grad_node, 0)};
+  AnfAlgo::SetOutputInferTypeAndShape(types, shapes, lsq_perchannel_grad_d.get());
+
+  AnfAlgo::CopyNodeAttr(kAttrNeg_trunc, lsq_perchannel_grad_node, lsq_perchannel_grad_d);
+  AnfAlgo::CopyNodeAttr(kAttrChannelAxis, lsq_perchannel_grad_node, lsq_perchannel_grad_d);
+  CreateMultipleOutputsOfAnfNode(graph, lsq_perchannel_grad_d, kFakeLearnedScaleQuantGradDOutputNum,
+                                 lsq_perchannel_grad_d_outputs);
+}
+
+void CreateOutputsOfLSQPerChannelReduceGrad(const FuncGraphPtr &graph, const CNodePtr &lsq_perchannel_grad_node,
+                                            const std::vector<AnfNodePtr> &lsq_perchannel_grad_d_outputs,
+                                            std::vector<AnfNodePtr> *lsq_perchannel_reduce_grad_outputs) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(lsq_perchannel_grad_node);
+  MS_EXCEPTION_IF_NULL(lsq_perchannel_reduce_grad_outputs);
+  const auto &lsq_perchannel_grad_inputs = lsq_perchannel_grad_node->inputs();
+  if (lsq_perchannel_grad_inputs.size() < kFakeLearnedScaleQuantGradInputNum) {
+    MS_LOG(EXCEPTION) << "lsq_perchannel_grad_node has wrong inputs size"
+                      << " trace: " << trace::DumpSourceLines(lsq_perchannel_grad_node);
+  }
+  if (lsq_perchannel_grad_d_outputs.size() != kFakeLearnedScaleQuantGradDOutputNum) {
+    MS_LOG(EXCEPTION) << "lsq_perchannel_grad_d_outputs has wrong size"
+                      << " trace: " << trace::DumpSourceLines(lsq_perchannel_grad_node);
+  }
+  std::vector<AnfNodePtr> lsq_perchannel_reduce_grad_inputs = {
+    NewValueNode(std::make_shared<Primitive>(kFakeLearnedScaleQuantPerChannelGradDReduceOpName)),
+    lsq_perchannel_grad_d_outputs[1]};
+  auto lsq_perchannel_reduce_grad = graph->NewCNode(lsq_perchannel_reduce_grad_inputs);
+  MS_EXCEPTION_IF_NULL(lsq_perchannel_reduce_grad);
+  lsq_perchannel_reduce_grad->set_scope(lsq_perchannel_grad_node->scope());
+
+  auto types = {AnfAlgo::GetOutputInferDataType(lsq_perchannel_grad_node, 1)};
+  auto shapes = {AnfAlgo::GetOutputInferShape(lsq_perchannel_grad_node, 1)};
+  AnfAlgo::SetOutputInferTypeAndShape(types, shapes, lsq_perchannel_reduce_grad.get());
+  AnfAlgo::CopyNodeAttr(kAttrChannelAxis, lsq_perchannel_grad_node, lsq_perchannel_reduce_grad);
+  (*lsq_perchannel_reduce_grad_outputs).push_back(lsq_perchannel_reduce_grad);
+}
+}  // namespace
+const BaseRef FakeLearnedScaleQuantPerLayerGradUnifyMindIR::DefinePattern() const {
+  VarPtr Xs = std::make_shared<SeqVar>();
+  auto prim = std::make_shared<Primitive>(kFakeLearnedScaleQuantPerLayerGradOpName);
+  return VectorRef({prim, Xs});
+}
+
+const AnfNodePtr FakeLearnedScaleQuantPerLayerGradUnifyMindIR::Process(const FuncGraphPtr &func_graph,
+                                                                       const AnfNodePtr &node, const EquivPtr &) const {
+  MS_EXCEPTION_IF_NULL(node);
+  MS_EXCEPTION_IF_NULL(func_graph);
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  auto primitive = AnfAlgo::GetCNodePrimitive(cnode);
+  MS_EXCEPTION_IF_NULL(primitive);
+
+  std::vector<AnfNodePtr> lsq_perlayer_grad_d_outputs;
+  CreateOutputsOfLSQPerLayerGradD(func_graph, cnode, &lsq_perlayer_grad_d_outputs);
+  if (lsq_perlayer_grad_d_outputs.size() != kFakeLearnedScaleQuantGradOutputNum) {
+    MS_LOG(EXCEPTION) << "fake_learned_scale_quant_perlayer_grad_d_outputs has wrong size"
+                      << " trace: " << trace::DumpSourceLines(node);
+  }
+
+  std::vector<AnfNodePtr> lsq_perlayer_reduce_grad_outputs;
+  CreateOutputsOfLSQPerLayerReduceGrad(func_graph, cnode, lsq_perlayer_grad_d_outputs,
+                                       &lsq_perlayer_reduce_grad_outputs);
+  if (lsq_perlayer_reduce_grad_outputs.size() != kSingleOutputNum) {
+    MS_LOG(EXCEPTION) << "fake_learned_scale_quant_perlayer_reduce_grad_outputs has wrong size"
+                      << " trace: " << trace::DumpSourceLines(node);
+  }
+
+  std::vector<AnfNodePtr> make_tuple_inputs = {NewValueNode(prim::kPrimMakeTuple), lsq_perlayer_grad_d_outputs[0],
+                                               lsq_perlayer_reduce_grad_outputs[0]};
+  auto make_tuple = func_graph->NewCNode(make_tuple_inputs);
+  return make_tuple;
+}
+
+const BaseRef FakeLearnedScaleQuantPerChannelGradUnifyMindIR::DefinePattern() const {
+  VarPtr Xs = std::make_shared<SeqVar>();
+  auto prim = std::make_shared<Primitive>(kFakeLearnedScaleQuantPerChannelGradOpName);
+  return VectorRef({prim, Xs});
+}
+
+const AnfNodePtr FakeLearnedScaleQuantPerChannelGradUnifyMindIR::Process(const FuncGraphPtr &func_graph,
+                                                                         const AnfNodePtr &node,
+                                                                         const EquivPtr &) const {
+  MS_EXCEPTION_IF_NULL(node);
+  MS_EXCEPTION_IF_NULL(func_graph);
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  auto primitive = AnfAlgo::GetCNodePrimitive(cnode);
+  MS_EXCEPTION_IF_NULL(primitive);
+
+  std::vector<AnfNodePtr> lsq_perchannel_grad_d_outputs;
+  CreateOutputsOfLSQPerChannelGradD(func_graph, cnode, &lsq_perchannel_grad_d_outputs);
+  if (lsq_perchannel_grad_d_outputs.size() != kFakeLearnedScaleQuantGradOutputNum) {
+    MS_LOG(EXCEPTION) << "fake_learned_scale_quant_perchannel_grad_d_outputs has wrong size"
+                      << " trace: " << trace::DumpSourceLines(node);
+  }
+
+  std::vector<AnfNodePtr> lsq_perchannel_reduce_grad_outputs;
+  CreateOutputsOfLSQPerChannelReduceGrad(func_graph, cnode, lsq_perchannel_grad_d_outputs,
+                                         &lsq_perchannel_reduce_grad_outputs);
+  if (lsq_perchannel_reduce_grad_outputs.size() != kSingleOutputNum) {
+    MS_LOG(EXCEPTION) << "fake_learned_scale_quant_perchannel_reduce_grad_outputs has wrong size"
+                      << " trace: " << trace::DumpSourceLines(node);
+  }
+
+  std::vector<AnfNodePtr> make_tuple_inputs = {NewValueNode(prim::kPrimMakeTuple), lsq_perchannel_grad_d_outputs[0],
+                                               lsq_perchannel_reduce_grad_outputs[0]};
+  auto make_tuple = func_graph->NewCNode(make_tuple_inputs);
+  return make_tuple;
+}
+}  // namespace opt
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/optimizer/ascend/mindir/fake_learned_scale_quant_grad_unify_mindir.h
+++ b/mindspore/ccsrc/backend/optimizer/ascend/mindir/fake_learned_scale_quant_grad_unify_mindir.h
@ -0,0 +1,57 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_MINDIR_FAKE_LEARNED_SCALE_QUANT_GRAD_UNIFY_MINDIR_H_
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_MINDIR_FAKE_LEARNED_SCALE_QUANT_GRAD_UNIFY_MINDIR_H_
+
+#include "backend/optimizer/common/optimizer.h"
+#include "backend/optimizer/common/helper.h"
+
+namespace mindspore {
+namespace opt {
+constexpr size_t kFakeLearnedScaleQuantGradOutputNum = 2;
+constexpr size_t kFakeLearnedScaleQuantGradInputNum = 5;
+constexpr size_t kFakeLearnedScaleQuantGradDOutputNum = 2;
+constexpr auto kFakeLearnedScaleQuantPerLayerGradOpName = "FakeLearnedScaleQuantPerLayerGrad";
+constexpr auto kFakeLearnedScaleQuantPerLayerGradDOpName = "FakeLearnedScaleQuantPerLayerGradD";
+constexpr auto kFakeLearnedScaleQuantPerLayerGradDReduceOpName = "FakeLearnedScaleQuantPerLayerGradDReduce";
+constexpr auto kFakeLearnedScaleQuantPerChannelGradOpName = "FakeLearnedScaleQuantPerChannelGrad";
+constexpr auto kFakeLearnedScaleQuantPerChannelGradDOpName = "FakeLearnedScaleQuantPerChannelGradD";
+constexpr auto kFakeLearnedScaleQuantPerChannelGradDReduceOpName = "FakeLearnedScaleQuantPerChannelGradDReduce";
+
+constexpr auto kAttrNeg_trunc = "neg_trunc";
+constexpr auto kAttrChannelAxis = "channel_axis";
+
+class FakeLearnedScaleQuantPerLayerGradUnifyMindIR : public PatternProcessPass {
+ public:
+  explicit FakeLearnedScaleQuantPerLayerGradUnifyMindIR(bool multigraph = true)
+      : PatternProcessPass("fake_learned_scale_quant_perlayer_grad_unify_mindir", multigraph) {}
+  ~FakeLearnedScaleQuantPerLayerGradUnifyMindIR() override = default;
+  const BaseRef DefinePattern() const override;
+  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
+};
+
+class FakeLearnedScaleQuantPerChannelGradUnifyMindIR : public PatternProcessPass {
+ public:
+  explicit FakeLearnedScaleQuantPerChannelGradUnifyMindIR(bool multigraph = true)
+      : PatternProcessPass("fake_learned_scale_quant_perchannel_grad_unify_mindir", multigraph) {}
+  ~FakeLearnedScaleQuantPerChannelGradUnifyMindIR() override = default;
+  const BaseRef DefinePattern() const override;
+  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
+};
+
+}  // namespace opt
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_MINDIR_FAKE_LEARNED_SCALE_QUANT_GRAD_UNIFY_MINDIR_H_
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@ -39,6 +39,7 @@
 #include "backend/optimizer/ascend/mindir/maxpool_with_argmax_unify_mindir.h"
 #include "backend/optimizer/ascend/mindir/conv2d_unify_mindir.h"
 #include "backend/optimizer/ascend/mindir/optimizer_unify_output.h"
+#include "backend/optimizer/ascend/mindir/fake_learned_scale_quant_grad_unify_mindir.h"
 #include "backend/optimizer/ascend/mindir/sparse_softmax_cross_entropy_with_logits_unify_mindir.h"
 #include "backend/optimizer/ascend/mindir/slice_grad_unify_mindir.h"
 #include "backend/optimizer/ascend/mindir/avg_pool_grad_unify_mindir.h"
@ -374,6 +375,8 @@ void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) {
  unify_mindir_pm->AddPass(std::make_shared<opt::MomentumUnifyOutput>());
  unify_mindir_pm->AddPass(std::make_shared<opt::RMSPropUnifyOutput>());
  unify_mindir_pm->AddPass(std::make_shared<opt::CenteredRMSPropUnifyOutput>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::FakeLearnedScaleQuantPerLayerGradUnifyMindIR>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::FakeLearnedScaleQuantPerChannelGradUnifyMindIR>());
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
--- a/mindspore/ccsrc/pipeline/jit/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc
@ -397,7 +397,9 @@ void ExecutorPy::GetWeightInfo(const CNodePtr &root_node, const AnfNodePtr &weig
  CNodePtr cnode = nullptr;
  auto is_quant_cnode = [](const AnfNodePtr &node) {
    return IsPrimitiveCNode(node, prim::kPrimFakeQuantPerLayer) ||
-           IsPrimitiveCNode(node, prim::kPrimFakeQuantPerChannel);
+           IsPrimitiveCNode(node, prim::kPrimFakeQuantPerChannel) ||
+           IsPrimitiveCNode(node, prim::kPrimFakeLearnedScaleQuantPerLayer) ||
+           IsPrimitiveCNode(node, prim::kPrimFakeLearnedScaleQuantPerChannel);
  };
  while (!is_quant_cnode(x)) {
    if (count >= max_depth) {
@ -452,7 +454,9 @@ std::map<std::string, std::pair<PrimitivePyAdapterPtr, std::string>> ExecutorPy:
  std::vector<AnfNodePtr> nodes = DeepScopedGraphSearchWithFilter(func_graph->get_return(), AlwaysInclude, filter);
  auto is_quant_cnode = [](const AnfNodePtr &node) {
    return IsPrimitiveCNode(node, prim::kPrimFakeQuantPerLayer) ||
-           IsPrimitiveCNode(node, prim::kPrimFakeQuantPerChannel);
+           IsPrimitiveCNode(node, prim::kPrimFakeQuantPerChannel) ||
+           IsPrimitiveCNode(node, prim::kPrimFakeLearnedScaleQuantPerLayer) ||
+           IsPrimitiveCNode(node, prim::kPrimFakeLearnedScaleQuantPerChannel);
  };
  for (const auto &node : nodes) {
    auto root_node = node->cast<CNodePtr>();
@ -461,7 +465,15 @@ std::map<std::string, std::pair<PrimitivePyAdapterPtr, std::string>> ExecutorPy:
    }
    auto weight = root_node->input(2);
    if (!is_quant_cnode(weight)) {
-      continue;
+      auto tuple_node = weight->cast<CNodePtr>();
+      if (tuple_node != nullptr) {
+        auto fake_node = tuple_node->input(1);
+        if (!is_quant_cnode(fake_node)) {
+          continue;
+        } else {
+          weight = fake_node;
+        }
+      }
    }
    // get parameter weight's name
    auto cnode = weight->cast<CNodePtr>();
--- a/mindspore/compression/export/quant_export.py
+++ b/mindspore/compression/export/quant_export.py
@ -56,6 +56,7 @@ class ExportToQuantInferNetwork:
        self.input_zero_point = round(mean)
        self.data_type = mstype.int8
        self.network = copy.deepcopy(network)
+        self.network_bk = copy.deepcopy(network)
        self.all_parameters = {p.name: p for p in self.network.get_parameters()}
        self.get_inputs_table(inputs)
        self.mean = mean
@ -83,6 +84,7 @@ class ExportToQuantInferNetwork:
        """convert network's quant subcell to deploy subcell"""
        # Calculate the scale and zero point
        w_minq_name = cell_core.fake_quant_weight.minq.name
+        w_maxq_name = cell_core.fake_quant_weight.maxq.name
        np_type = mstype.dtype_to_nptype(self.data_type)
        param_dict = dict()
        param_dict["filter_maxq"] = None
@ -102,16 +104,23 @@ class ExportToQuantInferNetwork:
                quant_utils.scale_zp_max_min_from_fake_quant_cell(fake_quant_a_out, np_type)

        info = self.quant_info_table.get(w_minq_name, None)
+        if not info:
+            info = self.quant_info_table.get(w_maxq_name, None)
        if info:
-            fake_quant_a_in_op, minq_name = info
+            _, minq_name = info
            if minq_name == 'input':
                scale_a_in, zp_a_in, param_dict["input_maxq"], param_dict["input_minq"] = \
                    self.input_scale, self.input_zero_point, 'None', 'None'
            else:
-                maxq = self.all_parameters[minq_name[:-4] + "maxq"]
-                minq = self.all_parameters[minq_name]
+                fake_quant_a_in_prefix = minq_name[:-5]
+                cells = self.network_bk.cells_and_names()
+                for cell in cells:
+                    if cell[0].endswith(fake_quant_a_in_prefix):
+                        fake_quant_a_in = cell[1]
+                        break
+
                scale_a_in, zp_a_in, param_dict["input_maxq"], param_dict["input_minq"] = \
-                    quant_utils.scale_zp_max_min_from_data(fake_quant_a_in_op, minq, maxq, np_type)
+                    quant_utils.scale_zp_max_min_from_fake_quant_cell(fake_quant_a_in, np_type)
        else:
            # skip quant layer
            scale_a_in, zp_a_in = 1.0, 0.0
@ -140,9 +149,8 @@ class ExportToQuantInferNetwork:
        weight_b = weight
        bias_b = bias
        # apply the quant
-        fake_quant_weight_op = cell_core.fake_quant_weight.fake_quant_infer
-        weight = quant_utils.weight2int(weight, scale_w, zp_w, np_type, fake_quant_weight_op.num_bits,
-                                        fake_quant_weight_op.narrow_range)
+        weight = quant_utils.weight2int(weight, scale_w, zp_w, np_type, cell_core.fake_quant_weight.num_bits,
+                                        cell_core.fake_quant_weight.narrow_range)
        if bias is not None:
            bias = Tensor(bias / scale_a_in / scale_w, mstype.int32)

--- a/mindspore/compression/quant/qat.py
+++ b/mindspore/compression/quant/qat.py
@ -22,15 +22,15 @@ aware training, MindSpore provides conversion functions to convert the trained m
 """

 import re
-
 import mindspore.context as context
-
+import numpy as np
 from ... import nn, ops
 from ..._checkparam import Validator, Rel
 from ...nn.layer import quant
 from ...ops import functional as F
 from ..common import QuantDtype
 from .quantizer import Quantizer, OptimizeOption
+from .quant_utils import compute_KL_threshold


 __all__ = ["QuantizationAwareTraining", "create_quant_config"]
@ -41,7 +41,8 @@ def create_quant_config(quant_observer=(nn.FakeQuantWithMinMaxObserver, nn.FakeQ
                        quant_dtype=(QuantDtype.INT8, QuantDtype.INT8),
                        per_channel=(False, False),
                        symmetric=(False, False),
-                        narrow_range=(False, False)):
+                        narrow_range=(False, False),
+                        mode="DEFAULT"):
    r"""
    Config the observer type of weights and data flow with quant params.

@ -62,6 +63,8 @@ def create_quant_config(quant_observer=(nn.FakeQuantWithMinMaxObserver, nn.FakeQ
            element represents data flow. Default: (False, False)
        narrow_range (Union[bool, list, tuple]): Whether the quantization algorithm uses narrow range or not.
            The first element represents weights and the second element represents data flow. Default: (False, False)
+        mode (String): Optional quantization mode, currently only `DEFAULT`(QAT) and `LEARNED_SCALE` are supported.
+            Default: ("DEFAULT")

    Returns:
        QuantConfig, Contains the observer type of weight and activation.
@ -70,10 +73,10 @@ def create_quant_config(quant_observer=(nn.FakeQuantWithMinMaxObserver, nn.FakeQ
        raise ValueError("Arg 'per_channel' second element must be 'False'.")
    weight_observer = quant_observer[0].partial_init(quant_delay=quant_delay[0], quant_dtype=quant_dtype[0],
                                                     per_channel=per_channel[0], symmetric=symmetric[0],
-                                                     narrow_range=narrow_range[0])
+                                                     narrow_range=narrow_range[0], mode=mode)
    act_observer = quant_observer[-1].partial_init(quant_delay=quant_delay[-1], quant_dtype=quant_dtype[-1],
                                                   per_channel=per_channel[-1], symmetric=symmetric[-1],
-                                                   narrow_range=narrow_range[-1])
+                                                   narrow_range=narrow_range[-1], mode=mode)
    return quant.QuantConfig(weight=weight_observer, activation=act_observer)


@ -103,14 +106,24 @@ class _AddFakeQuantAfterSubCell(nn.Cell):
    def __init__(self, subcell, **kwargs):
        super(_AddFakeQuantAfterSubCell, self).__init__(auto_prefix=False)
        self.subcell = subcell
-        self.fake_quant_act = quant.FakeQuantWithMinMaxObserver(min_init=-6,
-                                                                max_init=6,
+        self.mode = "DEFAULT"
+        self.max_init = 6
+        self.min_init = -6
+
+        if OptimizeOption.LEARNED_SCALE in kwargs["optimize_option"]:
+            self.mode = "LEARNED_SCALE"
+            self.max_init = 16
+            self.min_init = -16
+
+        self.fake_quant_act = quant.FakeQuantWithMinMaxObserver(min_init=self.min_init,
+                                                                max_init=self.max_init,
                                                                ema=True,
                                                                quant_dtype=kwargs["quant_dtype"],
                                                                quant_delay=kwargs["quant_delay"],
                                                                per_channel=kwargs["per_channel"],
                                                                symmetric=kwargs["symmetric"],
-                                                                narrow_range=kwargs["narrow_range"])
+                                                                narrow_range=kwargs["narrow_range"],
+                                                                mode=self.mode)

    def construct(self, *data):
        output = self.subcell(*data)
@ -128,7 +141,8 @@ class QuantizationAwareTraining(Quantizer):
        quant_delay (Union[int, list, tuple]): Number of steps after which weights and activations are quantized during
            eval. The first element represents weights and second element represents data flow. Default: (0, 0)
        quant_dtype (Union[QuantDtype, list, tuple]): Datatype to use for quantize weights and activations. The first
-            element represents weights and second element represents data flow.
+            element represents weights and second element represents data flow. It is necessary to consider the
+            precision support of hardware devices in the practical quantization infer scenario.
            Default: (QuantDtype.INT8, QuantDtype.INT8)
        per_channel (Union[bool, list, tuple]):  Quantization granularity based on layer or on channel. If `True`
            then base on per channel otherwise base on per layer. The first element represents weights
@ -139,7 +153,11 @@ class QuantizationAwareTraining(Quantizer):
        narrow_range (Union[bool, list, tuple]): Whether the quantization algorithm uses narrow range or not.
            The first element represents weights and the second element represents data flow. Default: (False, False)
        optimize_option (Union[OptimizeOption, list, tuple]): Specifies the quant algorithm and options, currently only
-            support QAT. Default: OptimizeOption.QAT
+            support QAT and LEARNED_SCALE (Note that, if both QAT and LEARNED_SCALE are configured, LEARNED_SCALE has
+            a higher priority. LEARNED_SCALE currently only work under some constraints, which includes: freeze_bn=0,
+            quant_delay=0, symmetric=Ture, narrow_range=True, More specifically, for operators such as ReLu and ReLu6,
+            which only have positive values, we add a negative truncation to optimize this scenario, and narrow_range
+            will automatically match to False). Default: OptimizeOption.QAT
        one_conv_fold (bool): Flag to used one conv bn fold ops for simulation inference operation. Default: True.

    Examples:
@ -218,11 +236,27 @@ class QuantizationAwareTraining(Quantizer):
        self.one_conv_fold = Validator.check_bool(one_conv_fold, "one conv fold")
        self._convert_method_map = {nn.Conv2dBnAct: self._convert_conv,
                                    nn.DenseBnAct: self._convert_dense}
+        self.mode = "DEFAULT"
+        if OptimizeOption.LEARNED_SCALE in self.optimize_option:
+            self.mode = "LEARNED_SCALE"
+            if not self.weight_symmetric or not self.act_symmetric:
+                raise ValueError("OptimizeOption.LEARNED_SCALE currently only support "
+                                 "symmetric=(True, True) for quant")
+            if not self.weight_range or not self.act_range:
+                raise ValueError("OptimizeOption.LEARNED_SCALE currently only support narrow_range=(True, True) "
+                                 "for quant")
+            if self.freeze_bn != 0:
+                raise ValueError("OptimizeOption.LEARNED_SCALE currently only support freeze_bn equal to 0, "
+                                 "but get freeze_bn={}".format(self.freeze_bn))
+            if self.weight_qdelay != 0 or self.act_qdelay != 0:
+                raise ValueError("OptimizeOption.LEARNED_SCALE currently only support quant_delay=(0, 0)")
        self.quant_config = create_quant_config(quant_delay=quant_delay,
                                                quant_dtype=quant_dtype,
                                                per_channel=per_channel,
                                                symmetric=symmetric,
-                                                narrow_range=narrow_range)
+                                                narrow_range=narrow_range,
+                                                mode=self.mode)
+        self.eps = 1e-5

    def _convert_op_name(self, name):
        pattern = re.compile(r'([A-Z]{1})')
@ -247,7 +281,7 @@ class QuantizationAwareTraining(Quantizer):
        if context.get_context('device_target') not in support_device:
            raise KeyError("Unsupported {} device target.".format(context.get_context('device_target')))

-        if OptimizeOption.QAT in self.optimize_option:
+        if OptimizeOption.QAT in self.optimize_option or OptimizeOption.LEARNED_SCALE in self.optimize_option:
            network.update_cell_prefix()
            network = self._convert_subcells2quant(network)
            network.update_cell_type("quant")
@ -274,7 +308,18 @@ class QuantizationAwareTraining(Quantizer):
        if isinstance(network, nn.SequentialCell) and change:
            network.cell_list = list(network.cells())

-        # add FakeQuant OP after OP in while list
+        # add FakeQuant OP after OP in white list, but not including those wrapped in the below quantization cell.
+        if isinstance(network, (nn.FakeQuantWithMinMaxObserver,
+                                nn.Conv2dBnFoldQuantOneConv,
+                                nn.Conv2dBnFoldQuant,
+                                nn.Conv2dBnWithoutFoldQuant,
+                                nn.Conv2dQuant,
+                                nn.DenseQuant,
+                                nn.ActQuant,
+                                nn.TensorAddQuant,
+                                nn.MulQuant)):
+            return network
+
        add_list = []
        for name in network.__dict__:
            if name[0] == '_':
@ -289,7 +334,8 @@ class QuantizationAwareTraining(Quantizer):
                                                  quant_delay=self.act_qdelay,
                                                  per_channel=self.act_channel,
                                                  symmetric=self.act_symmetric,
-                                                  narrow_range=self.act_range)
+                                                  narrow_range=self.act_range,
+                                                  optimize_option=self.optimize_option)
            prefix = self._convert_op_name(prim_op.name)
            if network.param_prefix:
                prefix = '.'.join([network.param_prefix, self._convert_op_name(prim_op.name)])
@ -302,10 +348,22 @@ class QuantizationAwareTraining(Quantizer):
        """
        convert Conv2d cell to quant cell
        """
+        min_init = -6
+        max_init = 6
+        if OptimizeOption.LEARNED_SCALE in self.optimize_option:
+            subcell_weight_para = subcell.conv.weight.data.asnumpy()
+            if subcell.has_bn:
+                scale_factor = (subcell.batchnorm.gamma.data.asnumpy() /
+                                np.sqrt(subcell.batchnorm.moving_variance.data.asnumpy() + self.eps))
+                subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
+            min_init, max_init = self._KL_init(subcell_weight_para, self.weight_dtype)
+        self.quant_config = self.quant_config._replace(
+            weight=self.quant_config.weight.partial_init(min_init=min_init, max_init=max_init))
+
        conv_inner = subcell.conv
        if subcell.has_bn:
+            bn_inner = subcell.batchnorm
            if self.bn_fold:
-                bn_inner = subcell.batchnorm
                if self.one_conv_fold:
                    conv_inner = quant.Conv2dBnFoldQuantOneConv(conv_inner.in_channels,
                                                                conv_inner.out_channels,
@ -344,11 +402,7 @@ class QuantizationAwareTraining(Quantizer):
                conv_inner.beta = subcell.batchnorm.beta
                conv_inner.moving_mean = subcell.batchnorm.moving_mean
                conv_inner.moving_variance = subcell.batchnorm.moving_variance
-                del subcell.batchnorm
-                subcell.batchnorm = None
-                subcell.has_bn = False
            else:
-                bn_inner = subcell.batchnorm
                conv_inner = quant.Conv2dBnWithoutFoldQuant(conv_inner.in_channels,
                                                            conv_inner.out_channels,
                                                            kernel_size=conv_inner.kernel_size,
@ -368,20 +422,15 @@ class QuantizationAwareTraining(Quantizer):
                conv_inner.batchnorm.beta = subcell.batchnorm.beta
                conv_inner.batchnorm.moving_mean = subcell.batchnorm.moving_mean
                conv_inner.batchnorm.moving_variance = subcell.batchnorm.moving_variance
-                del subcell.batchnorm
-                subcell.batchnorm = None
-                subcell.has_bn = False
+            del subcell.batchnorm
+            subcell.batchnorm = None
+            subcell.has_bn = False
        else:
-            conv_inner = quant.Conv2dQuant(conv_inner.in_channels,
-                                           conv_inner.out_channels,
-                                           kernel_size=conv_inner.kernel_size,
-                                           stride=conv_inner.stride,
-                                           pad_mode=conv_inner.pad_mode,
-                                           padding=conv_inner.padding,
-                                           dilation=conv_inner.dilation,
-                                           group=conv_inner.group,
-                                           has_bias=conv_inner.has_bias,
-                                           quant_config=self.quant_config,
+            conv_inner = quant.Conv2dQuant(conv_inner.in_channels, conv_inner.out_channels,
+                                           kernel_size=conv_inner.kernel_size, stride=conv_inner.stride,
+                                           pad_mode=conv_inner.pad_mode, padding=conv_inner.padding,
+                                           dilation=conv_inner.dilation, group=conv_inner.group,
+                                           has_bias=conv_inner.has_bias, quant_config=self.quant_config,
                                           quant_dtype=self.weight_dtype)
        # change original network Conv2D OP parameters to quant network
        conv_inner.weight = subcell.conv.weight
@ -392,18 +441,28 @@ class QuantizationAwareTraining(Quantizer):
            subcell.activation = self._convert_activation(subcell.activation)
        elif subcell.after_fake:
            subcell.has_act = True
-            subcell.activation = _AddFakeQuantAfterSubCell(F.identity,
-                                                           quant_dtype=self.act_dtype,
-                                                           quant_delay=self.act_qdelay,
-                                                           per_channel=self.act_channel,
-                                                           symmetric=self.act_symmetric,
-                                                           narrow_range=self.act_range)
+            subcell.activation = _AddFakeQuantAfterSubCell(F.identity, quant_dtype=self.act_dtype,
+                                                           quant_delay=self.act_qdelay, per_channel=self.act_channel,
+                                                           symmetric=self.act_symmetric, narrow_range=self.act_range,
+                                                           optimize_option=self.optimize_option)
        return subcell

    def _convert_dense(self, subcell):
        """
        convert dense cell to quant cell
        """
+        min_init = -6
+        max_init = 6
+        if OptimizeOption.LEARNED_SCALE in self.optimize_option:
+            subcell_weight_para = subcell.dense.weight.data.asnumpy()
+            if subcell.has_bn:
+                scale_factor = (subcell.batchnorm.gamma.data.asnumpy() /
+                                np.sqrt(subcell.batchnorm.moving_variance.data.asnumpy() + self.eps))
+                subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
+            min_init, max_init = self._KL_init(subcell_weight_para, self.weight_dtype)
+        self.quant_config = self.quant_config._replace(
+            weight=self.quant_config.weight.partial_init(min_init=min_init, max_init=max_init))
+
        dense_inner = subcell.dense
        dense_inner = quant.DenseQuant(dense_inner.in_channels,
                                       dense_inner.out_channels,
@ -424,7 +483,8 @@ class QuantizationAwareTraining(Quantizer):
                                                           quant_delay=self.act_qdelay,
                                                           per_channel=self.act_channel,
                                                           symmetric=self.act_symmetric,
-                                                           narrow_range=self.act_range)
+                                                           narrow_range=self.act_range,
+                                                           optimize_option=self.optimize_option)
        return subcell

    def _convert_activation(self, activation):
@ -434,6 +494,7 @@ class QuantizationAwareTraining(Quantizer):
        act_class = activation.__class__
        act_list = [nn.ReLU, nn.ReLU6, nn.Sigmoid]
        act_list_with_fake_before = [nn.LeakyReLU, nn.HSigmoid, nn.HSwish]
+
        if act_class in act_list:
            return quant.ActQuant(activation=activation,
                                  quant_config=self.quant_config,
@ -445,3 +506,79 @@ class QuantizationAwareTraining(Quantizer):
                                  quant_config=self.quant_config,
                                  quant_dtype=self.act_dtype)
        raise ValueError("Unsupported activation in auto quant: ", act_class)
+
+    def _KL_init(self, subcell_weight_para, weight_dtype):
+        """
+        Calculate the value of max_init and min_init with compute_KL_threshold.
+        """
+        if self.weight_channel:
+            max_init = [compute_KL_threshold(weight_para_each, weight_dtype)
+                        for weight_para_each in subcell_weight_para]
+            min_init = [-x for x in max_init]
+        else:
+            max_init = [compute_KL_threshold(subcell_weight_para, weight_dtype)]
+            min_init = [-x for x in max_init]
+        return min_init, max_init
+
+    def set_mixed_bits(self, network, strategy):
+        r"""
+        Set network's quantization strategy, this function is currently only valid for `LEARNED_SCALE`
+        optimize_option.
+        Input:
+            network (Cell): input network
+            strategy (List): the quantization strategy for layers that need to be quantified (eg. [[8], [8],
+            ..., [6], [4], [8]]), currently only the quant_dtype for weights of the dense layer and the
+            convolution layer is supported.
+        Output:
+            network (Cell)
+        """
+        if OptimizeOption.LEARNED_SCALE not in self.optimize_option:
+            raise ValueError("The `set_mixed_bits` function is currently only valid for `LEARNED_SCALE` "
+                             "optimize_option.")
+
+        self.quantizable_idx = []
+        pass_cell = None
+        for i, cell_and_name in enumerate(network.cells_and_names()):
+            cell = cell_and_name[1]
+            if isinstance(cell, (nn.Conv2dBnAct, nn.DenseBnAct)) and cell is not pass_cell:
+                self.quantizable_idx.append(i)
+
+        assert len(self.quantizable_idx) == len(strategy)
+        quantizable_layer_bit_dict = {idx: bit for idx, bit in zip(self.quantizable_idx, strategy)}
+        type_map = {
+            QuantDtype.INT2.num_bits: QuantDtype.INT2,
+            QuantDtype.INT3.num_bits: QuantDtype.INT3,
+            QuantDtype.INT4.num_bits: QuantDtype.INT4,
+            QuantDtype.INT5.num_bits: QuantDtype.INT5,
+            QuantDtype.INT6.num_bits: QuantDtype.INT6,
+            QuantDtype.INT7.num_bits: QuantDtype.INT7,
+            QuantDtype.INT8.num_bits: QuantDtype.INT8
+        }
+        for i, cell_and_name in enumerate(network.cells_and_names()):
+            cell = cell_and_name[1]
+            if i not in self.quantizable_idx:
+                continue
+            else:
+                if isinstance(cell, (nn.Conv2dBnAct, nn.DenseBnAct)):
+                    cell.weight_dtype = type_map[quantizable_layer_bit_dict[i][0]]
+                    if isinstance(cell, nn.Conv2dBnAct):
+                        subcell_weight_para = cell.conv.weight.data.asnumpy()
+                        if hasattr(cell.conv, 'gamma'):
+                            scale_factor = (cell.conv.gamma.data.asnumpy() /
+                                            np.sqrt(cell.conv.moving_variance.data.asnumpy() + self.eps))
+                            subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
+                        min_init, max_init = self._KL_init(subcell_weight_para, cell.weight_dtype)
+                        cell.conv.fake_quant_weight.reset(quant_dtype=cell.weight_dtype,
+                                                          min_init=min_init,
+                                                          max_init=max_init)
+                    elif isinstance(cell, nn.DenseBnAct):
+                        subcell_weight_para = cell.dense.weight.data.asnumpy()
+                        if hasattr(cell.dense, 'gamma'):
+                            scale_factor = (cell.dense.gamma.data.asnumpy() /
+                                            np.sqrt(cell.dense.moving_variance.data.asnumpy() + self.eps))
+                            subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
+                        min_init, max_init = self._KL_init(subcell_weight_para, cell.weight_dtype)
+                        cell.dense.fake_quant_weight.reset(quant_dtype=cell.weight_dtype,
+                                                           min_init=min_init,
+                                                           max_init=max_init)
+        return network
--- a/mindspore/compression/quant/quant_utils.py
+++ b/mindspore/compression/quant/quant_utils.py
@ -15,9 +15,10 @@
 """Quantization utils."""

 import numpy as np
+from mindspore._checkparam import Validator
+from ... import nn

-
-__all__ = ["load_nonquant_param_into_quant_net"]
+__all__ = ["load_nonquant_param_into_quant_net", "query_quant_layers"]


 def cal_quantization_params(input_min,
@ -25,7 +26,8 @@ def cal_quantization_params(input_min,
                            data_type,
                            num_bits=8,
                            symmetric=False,
-                            narrow_range=False):
+                            narrow_range=False,
+                            neg_trunc=False):
    r"""
    Calculate quantization params for scale and zero point.

@ -36,6 +38,7 @@ def cal_quantization_params(input_min,
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Whether the quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): Whether the quantization algorithm uses narrow range or not. Default: False.
+        neg_trunc (bool): Whether the quantization algorithm uses negative truncation or not. Default: False.

    Returns:
        scale (numpy.ndarray): quantization param.
@ -65,17 +68,14 @@ def cal_quantization_params(input_min,
        quant_min = quant_min + 1

    # calculate scale
-    if symmetric:
+    if symmetric and not neg_trunc:
        input_max = np.maximum(-input_min, input_max)
        input_min = -input_max
    scale = (input_max - input_min) / (quant_max - quant_min)

    # calculate zero point
-    if symmetric:
-        zp = np.zeros(input_min.shape)
-    else:
-        zp_double = quant_min - input_min / scale
-        zp = np.floor(zp_double + 0.5)
+    zp_double = quant_min - input_min / scale
+    zp = np.floor(zp_double + 0.5)

    return scale, zp

@ -135,16 +135,16 @@ def weight2int(data, scale, zero_point, data_type, num_bits=8, narrow_range=Fals


 def scale_zp_max_min_from_fake_quant_cell(cell, data_type):
-    """Get calculate quantization params for scale, zero point, max and min from `FakeQuantWithMinMax`."""
+    """Get calculate quantization params for scale, zero point, max and min from `FakeQuantWithMinMaxObserver`."""
    minq = cell.minq.data.asnumpy()
    maxq = cell.maxq.data.asnumpy()
-    op = cell.fake_quant_infer

    scale, zp = cal_quantization_params(
        minq, maxq, data_type,
-        num_bits=op.num_bits,
-        symmetric=op.symmetric,
-        narrow_range=op.narrow_range)
+        num_bits=cell.num_bits,
+        symmetric=cell.symmetric,
+        narrow_range=cell.narrow_range,
+        neg_trunc=cell.neg_trunc)
    return scale, zp, maxq, minq


@ -267,6 +267,80 @@ def without_fold_batchnorm(weight, cell_quant):
    return weight, bias


+def compute_KL_threshold(data, bitwidth):
+    r"""
+    Using KL-J Distance to calculate the clip threshold.
+
+    Args:
+        - **data** (NumpyArray) - Data observed to calculate the threshold for quantization,
+        - **bitwidth** (QuantDtype) - The datatype of quantization.
+    Outputs:
+        Tensor with Shape 1. Threshold to calculate the data.
+    """
+    bitwidth = bitwidth.num_bits
+
+    data_min = 0
+    data_max = np.abs(data).max()
+    if data_max < 1e-5:
+        return 1e-5
+    hist, bin_edges = np.histogram(np.abs(data), bins='sqrt', range=(data_min, data_max), density=True)
+    hist = hist / np.sum(hist)
+    cumsum = np.cumsum(hist)
+    bit_pow_range = pow(2, int(bitwidth) - 1)
+    threshold = []
+    scaling_factor = []
+    kl = []
+    if bit_pow_range + 1 > len(bin_edges) - 1:
+        th_layer_out = bin_edges[-1]
+        return float(th_layer_out)
+    for i in range(bit_pow_range + 1, len(bin_edges), 1):
+        threshold_tmp = (i + 0.5) * (bin_edges[1] - bin_edges[0])
+        threshold = np.concatenate((threshold, [threshold_tmp]))
+        scaling_factor_tmp = threshold_tmp / (bit_pow_range - 1)
+        scaling_factor = np.concatenate((scaling_factor, [scaling_factor_tmp]))
+        # forward interpolation
+        cumsum_tmp = np.copy(cumsum)
+        cumsum_tmp[(i - 1):] = 1
+        fwd_x = np.linspace(0.0, 1.0, bit_pow_range)
+        fwd_xp = np.linspace(0.0, 1.0, i)
+        fwd_fp = cumsum_tmp[:i]
+        forward_interp = np.interp(fwd_x, fwd_xp, fwd_fp)
+        # backward interpolation
+        bwd_x = np.linspace(0.0, 1.0, i)
+        bwd_xp = np.linspace(0.0, 1.0, bit_pow_range)
+        bwd_fp = forward_interp
+        backward_interp = np.interp(bwd_x, bwd_xp, bwd_fp)
+        cumsum_tmp[:i] = backward_interp
+        kl_tmp = np.sum((cumsum - cumsum_tmp) * np.log2(cumsum / cumsum_tmp))  # Kullback-Leibler-J
+        kl = np.concatenate((kl, [kl_tmp]))
+    th_layer_out = threshold[np.argmin(kl)]
+    threshold = float(th_layer_out)
+    if threshold < 1e-5:
+        threshold = 1e-5
+    return threshold
+
+
+def query_quant_layers(network):
+    r"""
+    Query the network's quantization strategy of each quantized layer and print it to the screen, note that all the
+    quantization layers are queried before graph compile optimization in the graph mode, thus may be appear some
+    redundant quantized layers, which are not exist in practical execution.
+
+    Input:
+        network (Cell): input network
+
+    Returns:
+        None
+    """
+    network = Validator.check_isinstance("network", network, nn.Cell)
+    tplt = "{0:60}\t{1:10}"
+    for cell_and_name in network.cells_and_names():
+        cell_name = cell_and_name[0]
+        cell = cell_and_name[1]
+        if isinstance(cell, nn.FakeQuantWithMinMaxObserver):
+            print(tplt.format(cell_name, cell.quant_dtype))
+
+
 def load_nonquant_param_into_quant_net(quant_model, params_dict, quant_new_params=None):
    r"""
    Load fp32 model parameters into quantization model.
@ -287,7 +361,8 @@ def load_nonquant_param_into_quant_net(quant_model, params_dict, quant_new_param
        'moving_mean': iter(list(filter(lambda item: item[0].endswith('moving_mean'), params_dict.items()))),
        'moving_variance': iter(list(filter(lambda item: item[0].endswith('moving_variance'), params_dict.items()))),
        'minq': iter(list(filter(lambda item: item[0].endswith('minq'), params_dict.items()))),
-        'maxq': iter(list(filter(lambda item: item[0].endswith('maxq'), params_dict.items())))
+        'maxq': iter(list(filter(lambda item: item[0].endswith('maxq'), params_dict.items()))),
+        'quant_max': iter(list(filter(lambda item: item[0].endswith('quant_max'), params_dict.items())))
    }

    for name, param in quant_model.parameters_and_names():
@ -300,3 +375,26 @@ def load_nonquant_param_into_quant_net(quant_model, params_dict, quant_new_param
        if value_param:
            param.set_data(value_param[1].data)
            print(f'init model param {name} with checkpoint param {value_param[0]}')
+
+
+    # Perform KL_init when learned scale quantization is executed.
+    for cell_and_name in quant_model.cells_and_names():
+        cell = cell_and_name[1]
+        if isinstance(cell, (nn.Conv2dBnFoldQuantOneConv, nn.Conv2dBnFoldQuant, nn.Conv2dBnWithoutFoldQuant,
+                             nn.Conv2dQuant, nn.DenseQuant)) and cell.fake_quant_weight.mode == "LEARNED_SCALE":
+            subcell_weight_para = cell.weight.data.asnumpy()
+            if hasattr(cell, 'gamma'):
+                scale_factor = (cell.gamma.data.asnumpy() /
+                                np.sqrt(cell.moving_variance.data.asnumpy() + 1e-5))
+                subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
+
+            if cell.fake_quant_weight.per_channel:
+                max_init = [compute_KL_threshold(weight_para_each, cell.fake_quant_weight.quant_dtype)
+                            for weight_para_each in subcell_weight_para]
+                min_init = [-x for x in max_init]
+            else:
+                max_init = [compute_KL_threshold(subcell_weight_para, cell.fake_quant_weight.quant_dtype)]
+                min_init = [-x for x in max_init]
+
+            cell.fake_quant_weight.reset(quant_dtype=cell.fake_quant_weight.quant_dtype,
+                                         min_init=min_init, max_init=max_init)
--- a/mindspore/compression/quant/quantizer.py
+++ b/mindspore/compression/quant/quantizer.py
@ -29,6 +29,9 @@ class OptimizeOption(Enum):
    # using quantization aware training
    QAT = "QAT"

+    # using the learned scale quantization
+    LEARNED_SCALE = "LEARNED_SCALE"
+
    def __str__(self):
        return self.value

--- a/mindspore/core/base/core_ops.h
+++ b/mindspore/core/base/core_ops.h
@ -295,6 +295,10 @@ inline const PrimitivePtr kPrimOnesLike = std::make_shared<Primitive>("OnesLike"
 inline const PrimitivePtr kPrimBpropCut = std::make_shared<Primitive>("bprop_cut");
 inline const PrimitivePtr kPrimFakeQuantPerLayer = std::make_shared<Primitive>("FakeQuantPerLayer");
 inline const PrimitivePtr kPrimFakeQuantPerChannel = std::make_shared<Primitive>("FakeQuantPerChannel");
+inline const PrimitivePtr kPrimFakeLearnedScaleQuantPerLayer =
+  std::make_shared<Primitive>("FakeLearnedScaleQuantPerLayer");
+inline const PrimitivePtr kPrimFakeLearnedScaleQuantPerChannel =
+  std::make_shared<Primitive>("FakeLearnedScaleQuantPerChannel");
 inline const PrimitivePtr kPrimFakeQuantWithMinMaxVars = std::make_shared<Primitive>("FakeQuantWithMinMaxVars");
 inline const PrimitivePtr kPrimApplyRMSProp = std::make_shared<Primitive>("ApplyRMSProp");
 inline const PrimitivePtr kPrimSparseApplyFtrl = std::make_shared<Primitive>("SparseApplyFtrl");
--- a/mindspore/nn/layer/quant.py
+++ b/mindspore/nn/layer/quant.py
@ -30,6 +30,7 @@ import mindspore.context as context
 from .normalization import BatchNorm2d
 from .activation import get_activation, ReLU
 from ..cell import Cell
+from ... import nn
 from ...ops.operations import _quant_ops as Q

 __all__ = [
@ -215,6 +216,8 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
    r"""
    Quantization aware operation which provides the fake quantization observer function on data with min and max.

+    The detail of the quantization mode `DEFAULT` is described as below:
+
    The running min/max :math:`x_{min}` and :math:`x_{max}` are computed as:

    .. math::
@ -269,10 +272,59 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
            output = u_X * scale + u_{min}
        \end{array}

+    The detail of the quantization mode `LEARNED_SCALE` is described as below:
+
+    The fake quant output is computed as:
+
+    .. math::
+
+        \bar{X}=\left\{\begin{matrix}
+        clip\left ( \frac{X}{maxq},0,1\right ) \qquad \quad if\quad neg\_trunc\\
+        clip\left ( \frac{X}{maxq},-1,1\right )\qquad \ if\quad otherwise
+        \end{matrix}\right. \\
+
+        output=\frac{floor\left ( \bar{X}\ast  Q_{max}+0.5  \right ) \ast scale }{Q_{max}}
+
+    where X is the input tensor.
+    where :math:`Q_{max}` (quant_max) is decided by quant_dtype and neg_trunc, for example, if quant_dtype=INT8
+    and neg_trunc works, :math:`Q_{max} = 256` , otherwise math:`Q_{max} = 127`.
+
+    The maxq is updated by training, and its gradient is calculated as follows:
+
+    .. math::
+
+        \frac{\partial \ output}{\partial \ maxq} & = \left\{\begin{matrix}
+        -\frac{X}{maxq}+\left \lfloor \frac{X}{maxq} \right \rceil \qquad if\quad bound_{lower}< \frac{X}{maxq}< 1\\
+        -1 \qquad \quad \qquad \quad if\quad \frac{X}{maxq}\le bound_{lower}\\
+         1  \qquad \quad \qquad \quad if\quad \frac{X}{maxq}\ge  1 \qquad \quad
+        \end{matrix}\right. \\
+
+        bound_{lower}=
+        \end{align}\left\{\begin{matrix}
+         0\qquad \quad if\quad neg\_trunc\\
+        -1\qquad if\quad otherwise
+        \end{matrix}\right.
+
+    Then minq is computed as:
+
+    .. math::
+
+        minq=\left\{\begin{matrix}
+        0  \qquad \qquad \quad if\quad neg\_trunc\\
+        -maxq\qquad if\quad otherwise
+        \end{matrix}\right.
+
+    When exporting, the scale and zero point zp is computed as:
+
+    .. math::
+
+        scale=\frac{maxq}{quant\_max} ,\quad zp=0 \\
+
+    zp is equal to 0 consistently, due to the LEARNED_SCALE`s symmetric nature.

    Args:
-        min_init (int, float): The initialized min value. Default: -6.
-        max_init (int, float): The initialized max value. Default: 6.
+        min_init (int, float, list): The initialized min value. Default: -6.
+        max_init (int, float, list): The initialized max value. Default: 6.
        ema (bool): The exponential Moving Average algorithm updates min and max. Default: False.
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
        per_channel (bool):  Quantization granularity based on layer or on channel. Default: False.
@ -282,7 +334,9 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
        symmetric (bool): Whether the quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): Whether the quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.
-
+        neg_trunc (bool): Whether the quantization algorithm uses nagetive truncation or not. Default: False.
+        mode (string): Optional quantization mode, currently only `DEFAULT`(QAT) and `LEARNED_SCALE` are supported.
+            Default: ("DEFAULT")
    Inputs:
        - **input** (Tensor) - The input of FakeQuantWithMinMaxObserver.

@ -290,7 +344,7 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
        Tensor, with the same type and shape as the `input`.

    Raises:
-        TypeError: If `min_init` or `max_init` is neither int nor float.
+        TypeError: If `min_init` or `max_init` is not int, float or list.
        TypeError: If `quant_delay` is not an int.
        TypeError: If `min_init` is not less than `max_init`.
        TypeError: If `quant_delay` is not greater than or equal to 0.
@ -318,18 +372,24 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
                 quant_dtype=QuantDtype.INT8,
                 symmetric=False,
                 narrow_range=False,
-                 quant_delay=0):
+                 quant_delay=0,
+                 neg_trunc=False,
+                 mode="DEFAULT"):
        """Initialize FakeQuantWithMinMaxObserver"""
        super(FakeQuantWithMinMaxObserver, self).__init__(quant_dtype=quant_dtype, per_channel=per_channel,
                                                          symmetric=symmetric, narrow_range=narrow_range,
                                                          num_channels=num_channels)
-        Validator.check_value_type("min_init", min_init, [int, float], type(self).__name__)
-        Validator.check_value_type("max_init", max_init, [int, float], type(self).__name__)
-        Validator.check("min_init", min_init, "max_init", max_init, rel=Rel.LT)
+        Validator.check_value_type("min_init", min_init, [int, float, list], type(self).__name__)
+        Validator.check_value_type("max_init", max_init, [int, float, list], type(self).__name__)
+        if isinstance(max_init, (int, float)) and isinstance(min_init, (int, float)):
+            Validator.check("min_init", min_init, "max_init", max_init, rel=Rel.LT)
+        elif not np.greater(max_init, min_init).all():
+            raise ValueError("`min_init` is not less than `max_init`, please reset the initial value.")
        Validator.check_non_negative_int(quant_delay, 'quant_delay')
        self.min_init = min_init
        self.max_init = max_init
        self.quant_dtype = quant_dtype
+        self.num_bits = quant_dtype.num_bits
        self.ema = ema
        self.ema_decay = ema_decay
        self.per_channel = per_channel
@ -338,43 +398,124 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
        self.quant_delay = quant_delay
        self.symmetric = symmetric
        self.narrow_range = narrow_range
+        self.neg_trunc = neg_trunc
+        self.mode = mode
        self.is_ascend = context.get_context('device_target') == "Ascend"
+        self.Neg = P.Neg()

-        # init tensor min and max for fake quantized operation
-        if self.per_channel:
-            min_array = np.array([self.min_init] * self.num_channels).astype(np.float32)
-            max_array = np.array([self.max_init] * self.num_channels).astype(np.float32)
-        else:
-            min_array = np.array([self.min_init]).astype(np.float32)
-            max_array = np.array([self.max_init]).astype(np.float32)
-        self.minq = Parameter(Tensor(min_array), name='quant_min', requires_grad=False)
-        self.maxq = Parameter(Tensor(max_array), name='quant_max', requires_grad=False)
+        min_array = self._get_init_array(self.min_init)
+        max_array = self._get_init_array(self.max_init)

-        # init fake quant relative op
-        if self.per_channel:
-            quant_fun = partial(Q.FakeQuantPerChannel, channel_axis=self.channel_axis)
-            ema_fun = partial(Q.MinMaxUpdatePerChannel, channel_axis=self.channel_axis)
-        else:
-            quant_fun = Q.FakeQuantPerLayer
-            ema_fun = Q.MinMaxUpdatePerLayer
+        if self.mode == "DEFAULT":
+            # init tensor min and max for fake quantized operation
+            self.minq = Parameter(Tensor(min_array), name='quant_min', requires_grad=False)
+            self.maxq = Parameter(Tensor(max_array), name='quant_max', requires_grad=False)
+
+            # init fake quant relative op
+            if self.per_channel:
+                quant_fun = partial(Q.FakeQuantPerChannel, channel_axis=self.channel_axis)
+                ema_fun = partial(Q.MinMaxUpdatePerChannel, channel_axis=self.channel_axis)
+            else:
+                quant_fun = Q.FakeQuantPerLayer
+                ema_fun = Q.MinMaxUpdatePerLayer
+
+            self.ema_update = ema_fun(ema=self.ema, ema_decay=self.ema_decay)
+            if self.is_ascend:
+                self.fake_quant_train = quant_fun(num_bits=self.quant_dtype.num_bits,
+                                                  symmetric=self.symmetric,
+                                                  narrow_range=self.narrow_range,
+                                                  quant_delay=self.quant_delay)
+                self.fake_quant_infer = self.fake_quant_train
+            else:
+                quant_fun = partial(quant_fun,
+                                    ema=self.ema,
+                                    ema_decay=ema_decay,
+                                    num_bits=self.quant_dtype.num_bits,
+                                    symmetric=self.symmetric,
+                                    narrow_range=self.narrow_range,
+                                    quant_delay=self.quant_delay)
+                self.fake_quant_train = quant_fun(training=True)
+                self.fake_quant_infer = quant_fun(training=False)
+        elif self.mode == "LEARNED_SCALE":
+            if not self.symmetric:
+                raise ValueError("The 'LEARNED_SCALE' mode only support symmetric quant, please set symmetric to True.")
+            if self.neg_trunc:
+                min_array = self._get_init_array(0)
+                self.narrow_range = False
+            elif not self.narrow_range:
+                raise ValueError("The 'LEARNED_SCALE' mode only support narrow_range=True config, "
+                                 "except for neg_trunc=True scenario.")
+
+            self._calculate_quant_max()
+
+            self.minq = Parameter(Tensor(min_array), name='minq')
+            self.maxq = Parameter(Tensor(max_array), name='maxq')
+            self.quant_max = Parameter(Tensor(np.array([self._quant_max]).astype(np.float32)),
+                                       name="quant_max", requires_grad=False)
+
+            # init fake quant relative op
+            if self.per_channel:
+                quant_fun = partial(Q.FakeLearnedScaleQuantPerChannel, channel_axis=self.channel_axis)
+            else:
+                quant_fun = Q.FakeLearnedScaleQuantPerLayer

-        self.ema_update = ema_fun(ema=self.ema, ema_decay=self.ema_decay)
-        if self.is_ascend:
-            self.fake_quant_train = quant_fun(num_bits=self.quant_dtype.num_bits,
-                                              symmetric=self.symmetric,
-                                              narrow_range=self.narrow_range,
-                                              quant_delay=self.quant_delay)
-            self.fake_quant_infer = self.fake_quant_train
-        else:
            quant_fun = partial(quant_fun,
-                                ema=self.ema,
-                                ema_decay=ema_decay,
-                                num_bits=self.quant_dtype.num_bits,
-                                symmetric=self.symmetric,
-                                narrow_range=self.narrow_range,
-                                quant_delay=self.quant_delay)
+                                quant_delay=self.quant_delay,
+                                neg_trunc=self.neg_trunc)
            self.fake_quant_train = quant_fun(training=True)
            self.fake_quant_infer = quant_fun(training=False)
+        else:
+            raise ValueError("Invalid mode, currently only valid for `DEFAULT` and `LEARNED_SCALE` mode.")
+
+    def reset(self, quant_dtype=QuantDtype.INT8, min_init=-6, max_init=6):
+        r"""
+        Reset the quant max parameter (eg. 256) and the initial value of the minq parameter and maxq parameter,
+        this function is currently only valid for `LEARNED_SCALE` mode.
+        """
+        if self.mode == "LEARNED_SCALE":
+            self.quant_dtype = quant_dtype
+            self.num_bits = quant_dtype.num_bits
+            self._calculate_quant_max()
+            if self.neg_trunc:
+                min_init = 0
+
+            self.min_init = min_init
+            self.max_init = max_init
+            min_array = self._get_init_array(self.min_init)
+            max_array = self._get_init_array(self.max_init)
+            self.minq.set_data(Tensor(min_array))
+            self.maxq.set_data(Tensor(max_array))
+            self.quant_max.set_data(Tensor(np.array([self._quant_max]).astype(np.float32)))
+        else:
+            raise ValueError("The `reset` function is currently only valid for `LEARNED_SCALE` mode.")
+
+    def _get_init_array(self, init_date):
+        """
+        Convert the initial value to array.
+        """
+        if isinstance(init_date, list) and self.per_channel and len(init_date) != self.num_channels:
+            raise ValueError("The length of the min_init/max_init list shuold be equal to num_channels for "
+                             "perchannel quant scenario, but get {}".format(len(init_date)))
+        if isinstance(init_date, list) and not self.per_channel and len(init_date) != 1:
+            raise ValueError("The length of the min_init/max_init list shuold be 1 for perlayer quant "
+                             "scenario, but get {}".format(len(init_date)))
+
+        if isinstance(init_date, list):
+            min_max_array = np.array(init_date).astype(np.float32)
+        elif self.per_channel and not isinstance(init_date, list):
+            min_max_array = np.array([init_date] * self.num_channels).astype(np.float32)
+        else:
+            min_max_array = np.array([init_date]).astype(np.float32)
+        return min_max_array
+
+    def _calculate_quant_max(self):
+        """
+        The quantization range is calculated according to num_bits.
+        """
+        if not self.neg_trunc:
+            self._quant_max = (1 << (self.num_bits - 1)) - 1
+        else:
+            self._quant_max = (1 << self.num_bits) - 1

    def extend_repr(self):
        s = 'quant_dtype={}, symmetric={}, narrow_range={}, ema={}({}), per_channel={}({}, {}), ' \
@ -385,13 +526,21 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
        return s

    def construct(self, x):
-        if self.training:
-            min_up, max_up = self.ema_update(x, self.minq, self.maxq)
-            self.minq = min_up
-            self.maxq = max_up
-            out = self.fake_quant_train(x, self.minq, self.maxq)
+        if self.mode == "LEARNED_SCALE":
+            if self.training:
+                out = self.fake_quant_train(x, self.maxq, self.quant_max)
+                if not self.neg_trunc:
+                    self.minq = self.Neg(self.maxq)
+            else:
+                out = self.fake_quant_infer(x, self.maxq, self.quant_max)
        else:
-            out = self.fake_quant_infer(x, self.minq, self.maxq)
+            if self.training:
+                min_up, max_up = self.ema_update(x, self.minq, self.maxq)
+                self.minq = min_up
+                self.maxq = max_up
+                out = self.fake_quant_train(x, self.minq, self.maxq)
+            else:
+                out = self.fake_quant_infer(x, self.minq, self.maxq)
        return out


@ -539,12 +688,13 @@ class Conv2dBnFoldQuantOneConv(Cell):
                                         requires_grad=False)

        # initialize fake ops
-        self.fake_quant_weight = quant_config.weight(min_init=-6,
-                                                     max_init=6,
-                                                     ema=False,
+        self.fake_quant_weight = quant_config.weight(ema=False,
                                                     channel_axis=channel_axis,
                                                     num_channels=out_channels,
                                                     quant_dtype=quant_dtype)
+        self.freeze_bn = False
+        if self.fake_quant_weight.mode == "LEARNED_SCALE":
+            self.freeze_bn = True
        self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps,
                                    momentum=self.momentum, data_format=self.format)

@ -579,6 +729,9 @@ class Conv2dBnFoldQuantOneConv(Cell):
        if self.fake:
            weight = self.fake_quant_weight(weight)
        conv = self.conv(x, weight)
+
+        if self.freeze_bn:
+            return conv + self.reshape((self.beta - self.gamma * self.moving_mean / running_std), (1, -1, 1, 1))
        scale_factor = self.reshape(scale_factor, (1, -1, 1, 1))
        if self.enable_default_train:
            scale_factor = P.Reciprocal()(scale_factor)
@ -730,9 +883,7 @@ class Conv2dBnFoldQuant(Cell):
                                         requires_grad=False)

        # initialize fake ops
-        self.fake_quant_weight = quant_config.weight(min_init=-6,
-                                                     max_init=6,
-                                                     ema=False,
+        self.fake_quant_weight = quant_config.weight(ema=False,
                                                     channel_axis=channel_axis,
                                                     num_channels=out_channels,
                                                     quant_dtype=quant_dtype)
@ -886,9 +1037,7 @@ class Conv2dBnWithoutFoldQuant(Cell):
        weight_shape = [out_channels, in_channels // group, *self.kernel_size]
        channel_axis = 0
        self.weight = Parameter(initializer(weight_init, weight_shape), name='weight')
-        self.fake_quant_weight = quant_config.weight(min_init=-6,
-                                                     max_init=6,
-                                                     ema=False,
+        self.fake_quant_weight = quant_config.weight(ema=False,
                                                     channel_axis=channel_axis,
                                                     num_channels=out_channels,
                                                     quant_dtype=quant_dtype)
@ -1005,9 +1154,7 @@ class Conv2dQuant(Cell):
                             dilation=self.dilation,
                             group=self.group)
        channel_axis = 0
-        self.fake_quant_weight = quant_config.weight(min_init=-6,
-                                                     max_init=6,
-                                                     ema=False,
+        self.fake_quant_weight = quant_config.weight(ema=False,
                                                     channel_axis=channel_axis,
                                                     num_channels=out_channels,
                                                     quant_dtype=quant_dtype)
@ -1111,9 +1258,7 @@ class DenseQuant(Cell):
        if activation is not None and not isinstance(self.activation, (Cell, Primitive)):
            raise TypeError("The activation must be str or Cell or Primitive,"" but got {}.".format(activation))
        self.activation_flag = self.activation is not None
-        self.fake_quant_weight = quant_config.weight(min_init=-6,
-                                                     max_init=6,
-                                                     ema=False,
+        self.fake_quant_weight = quant_config.weight(ema=False,
                                                     channel_axis=0,
                                                     num_channels=out_channels,
                                                     quant_dtype=quant_dtype)
@ -1198,6 +1343,8 @@ class ActQuant(_QuantActivation):
                 quant_config=quant_config_default,
                 quant_dtype=QuantDtype.INT8):
        super(ActQuant, self).__init__()
+        act_class = activation.__class__
+        act_list = [nn.ReLU, nn.ReLU6]
        self.act = Validator.check_isinstance("activation", activation, Cell)
        self.fake_before = Validator.check_bool(fake_before, "fake_before")
        if self.fake_before:
@ -1206,11 +1353,14 @@ class ActQuant(_QuantActivation):
                                                                 ema=ema,
                                                                 ema_decay=ema_decay,
                                                                 quant_dtype=quant_dtype)
+
+        neg_trunc = bool(act_class in act_list)
        self.fake_quant_act = quant_config.activation(min_init=-6,
                                                      max_init=6,
                                                      ema=ema,
                                                      ema_decay=ema_decay,
-                                                      quant_dtype=quant_dtype)
+                                                      quant_dtype=quant_dtype,
+                                                      neg_trunc=neg_trunc)

    def construct(self, x):
        if self.fake_before:
--- a/mindspore/ops/_grad/grad_quant_ops.py
+++ b/mindspore/ops/_grad/grad_quant_ops.py
@ -200,3 +200,30 @@ def get_bprop_wts_arq(self):
        return (dout, zeros_like(w_min), zeros_like(w_max))

    return bprop
+
+
+@bprop_getters.register(Q.FakeLearnedScaleQuantPerLayer)
+def get_bprop_fakequant_with_learned_scale_perlayer(self):
+    """Generate bprop for FakeLearnedScaleQuantPerLayer for GPU"""
+    op = Q.FakeLearnedScaleQuantPerLayerGrad(quant_delay=self.quant_delay,
+                                             neg_trunc=self.neg_trunc)
+
+    def bprop(x, x_alpha, x_quant_max, out, dout):
+        dx, dalpha = op(dout, x, x_alpha, x_quant_max)
+        return dx, dalpha, zeros_like(x_quant_max)
+
+    return bprop
+
+
+@bprop_getters.register(Q.FakeLearnedScaleQuantPerChannel)
+def get_bprop_fakequant_with_learned_scale_perchannel(self):
+    """Generate bprop for FakeLearnedScaleQuantPerChannel for GPU"""
+    op = Q.FakeLearnedScaleQuantPerChannelGrad(quant_delay=self.quant_delay,
+                                               neg_trunc=self.neg_trunc,
+                                               channel_axis=self.channel_axis)
+
+    def bprop(x, x_alpha, x_quant_max, out, dout):
+        dx, dalpha = op(dout, x, x_alpha, x_quant_max)
+        return dx, dalpha, zeros_like(x_quant_max)
+
+    return bprop
--- a/mindspore/ops/_op_impl/_custom_op/init.py
+++ b/mindspore/ops/_op_impl/_custom_op/init.py
@ -14,3 +14,9 @@
 # ============================================================================

 """custom ops"""
+from .fake_learned_scale_quant_perlayer import _fake_learned_scale_quant_perlayer_tbe
+from .fake_learned_scale_quant_perlayer_grad import _fake_learned_scale_quant_perlayer_grad_d_tbe
+from .fake_learned_scale_quant_perlayer_grad_reduce import _fake_learned_scale_quant_perlayer_grad_d_reduce_tbe
+from .fake_learned_scale_quant_perchannel import _fake_learned_scale_quant_perchannel_tbe
+from .fake_learned_scale_quant_perchannel_grad import _fake_learned_scale_quant_perchannel_grad_d_tbe
+from .fake_learned_scale_quant_perchannel_grad_reduce import _fake_learned_scale_quant_perchannel_grad_d_reduce_tbe
--- a/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel.py
+++ b/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel.py
@ -0,0 +1,125 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""FakeLearnedScaleQuantPerChannel op"""
+import te.lang.cce
+from te import tvm
+from te.platform.fusion_manager import fusion_manager
+from topi import generic
+from topi.cce import util
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+fake_learned_scale_quant_perchannel_op_info = TBERegOp("FakeLearnedScaleQuantPerChannel") \
+    .fusion_type("ELEMWISE") \
+    .async_flag(False) \
+    .binfile_name("fake_learned_scale_quant_perchannel.so") \
+    .compute_cost(10) \
+    .kernel_name("fake_learned_scale_quant_perchannel") \
+    .partial_flag(True) \
+    .attr("neg_trunc", "optional", "bool", "all") \
+    .attr("channel_axis", "optional", "int", "all") \
+    .input(0, "input_x", None, "required", None) \
+    .input(1, "alpha", None, "required", None) \
+    .input(2, "quant_max", None, "required", None) \
+    .output(0, "out", True, "required", "all") \
+    .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \
+    .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \
+    .get_op_info()
+
+
+@op_info_register(fake_learned_scale_quant_perchannel_op_info)
+def _fake_learned_scale_quant_perchannel_tbe():
+    """FakeLearnedScaleQuantPerChannel TBE register"""
+    return
+
+
+@fusion_manager.register("fake_learned_scale_quant_perchannel")
+def fake_learned_scale_quant_perchannel_compute(input_data, alpha_data, quant_max_data, neg_trunc,
+                                                kernel_name="fake_learned_scale_quant_perchannel"):
+    """FakeLearnedScaleQuantPerChannel"""
+    input_shape = te.lang.cce.util.shape_to_list(input_data.shape)
+    alpha_data = te.lang.cce.broadcast(alpha_data, input_shape, input_data.dtype)
+    quant_max_data = te.lang.cce.broadcast(quant_max_data, input_shape, input_data.dtype)
+
+    input_x = te.lang.cce.vdiv(input_data, alpha_data)
+
+    if neg_trunc:
+        input_x = te.lang.cce.round_to(input_x, 1.0, 0.0)
+    else:
+        input_x = te.lang.cce.round_to(input_x, 1.0, -1.0)
+
+    nudge_input = te.lang.cce.floor(te.lang.cce.vadds(te.lang.cce.vmul(input_x, quant_max_data), 0.5))
+    input_quant = te.lang.cce.vdiv(nudge_input, quant_max_data)
+    res = te.lang.cce.vmul(input_quant, alpha_data)
+
+    return res
+
+
+@util.check_input_type(dict, dict, dict, dict, bool, int, str)
+def fake_learned_scale_quant_perchannel(input_x, alpha, quant_max, out, neg_trunc, channel_axis,
+                                        kernel_name="fake_learned_scale_quant_perchannel"):
+    """FakeLearnedScaleQuantPerChannel"""
+    input_shape = input_x.get("shape")
+    input_x_shape_ = input_x.get("ori_shape")
+    input_x_format = input_x.get("format")
+    input_dtype = input_x.get("dtype")
+    alpha_shape = alpha.get("ori_shape")
+    alpha_dtype = alpha.get("dtype")
+    quant_max_shape = quant_max.get("ori_shape")
+    quant_max_dtype = quant_max.get("dtype")
+    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1], channel_axis_ need change to 1.
+    if channel_axis == 0 and input_x_shape_[0] != alpha_shape[0] and input_x_shape_[1] == alpha_shape[0]:
+        channel_axis_ = 1
+    else:
+        channel_axis_ = channel_axis
+
+    util.check_kernel_name(kernel_name)
+    util.check_shape_rule(input_shape)
+    util.check_shape_rule(alpha_shape, 1, 1, input_x_shape_[channel_axis_])
+    util.check_shape_rule(quant_max_shape, 1, 1, 1)
+    util.check_tensor_shape_size(input_shape)
+    util.check_tensor_shape_size(alpha_shape)
+    util.check_tensor_shape_size(quant_max_shape)
+
+    check_list = ["float32", "float16"]
+    input_dtype = input_dtype.lower()
+    alpha_dtype = alpha_dtype.lower()
+    quant_max_dtype = quant_max_dtype.lower()
+    util.check_dtype_rule(input_dtype, check_list)
+    util.check_dtype_rule(alpha_dtype, check_list)
+    util.check_dtype_rule(quant_max_dtype, check_list)
+
+    shape_c = [1] * len(input_shape)
+    shape_c[channel_axis_] = alpha.get("ori_shape")[0]
+    if input_x_format == "NC1HWC0" and channel_axis_ == 1:
+        shape_c = alpha.get("shape")
+
+    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
+    alpha_data = tvm.placeholder(shape_c, name="alpha_data", dtype=alpha_dtype)
+    quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data", dtype=quant_max_dtype)
+    res = fake_learned_scale_quant_perchannel_compute(input_data, alpha_data, quant_max_data, neg_trunc, kernel_name)
+
+    with tvm.target.cce():
+        sch = generic.auto_schedule(res)
+
+    tensor_list = [input_data, alpha_data, quant_max_data, res]
+    config = {"print_ir": False,
+              "name": kernel_name,
+              "tensor_list": tensor_list,
+              "bool_storage_as_1bit": False}
+
+    te.lang.cce.cce_build_code(sch, config)
--- a/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad.py
+++ b/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad.py
@ -0,0 +1,191 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""FakeLearnedScaleQuantPerChannelGradD op"""
+import te.lang.cce
+from te import tvm
+from te.platform.fusion_manager import fusion_manager
+from topi import generic
+from topi.cce import util
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+NEG_SCALAR_MIN_FP16 = -(2 ** (-24))
+NEG_SCALAR_MIN_FP32 = -(2 ** (-126))
+SCALAR_MIN_FP16 = 2 ** (-24)
+SCALAR_MIN_FP32 = 2 ** (-126)
+
+fake_learned_scale_quant_perchannel_grad_d_op_info = TBERegOp("FakeLearnedScaleQuantPerChannelGradD") \
+    .fusion_type("OPAQUE") \
+    .async_flag(False) \
+    .binfile_name("fake_learned_scale_quant_perchannel_grad_d.so") \
+    .compute_cost(10) \
+    .kernel_name("fake_learned_scale_quant_perchannel_grad_d") \
+    .partial_flag(True) \
+    .attr("neg_trunc", "optional", "bool", "all") \
+    .attr("channel_axis", "optional", "int", "all") \
+    .input(0, "dout", None, "required", None) \
+    .input(1, "input_x", None, "required", None) \
+    .input(2, "alpha", None, "required", None) \
+    .input(3, "quant_max", None, "required", None) \
+    .output(0, "dx", True, "required", "all") \
+    .output(1, "dalpha", True, "required", "all") \
+    .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default,
+                  DataType.F16_Default, DataType.F16_Default) \
+    .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD,
+                  DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default,
+                  DataType.F32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD,
+                  DataType.F32_5HD, DataType.F32_5HD) \
+    .get_op_info()
+
+
+@op_info_register(fake_learned_scale_quant_perchannel_grad_d_op_info)
+def _fake_learned_scale_quant_perchannel_grad_d_tbe():
+    """FakeLearnedScaleQuantPerChannelGradD TBE register"""
+    return
+
+
+@fusion_manager.register("fake_learned_scale_quant_perchannel_grad_d")
+def fake_learned_scale_quant_perchannel_grad_d_compute(dout, input_data, alpha_data, quant_max_data, neg_trunc,
+                                                       kernel_name="fake_learned_scale_quant_perchannel_grad_d"):
+    """FakeLearnedScaleQuantPerChannelGradD"""
+    input_shape = te.lang.cce.util.shape_to_list(input_data.shape)
+    alpha_data = te.lang.cce.broadcast(alpha_data, input_shape, input_data.dtype)
+    quant_max_data = te.lang.cce.broadcast(quant_max_data, input_shape, input_data.dtype)
+
+    input_x = te.lang.cce.vdiv(input_data, alpha_data)
+    input_div_alpha = input_x
+
+    if neg_trunc:
+        input_x = te.lang.cce.round_to(input_x, 1.0, 0.0)
+    else:
+        input_x = te.lang.cce.round_to(input_x, 1.0, -1.0)
+
+    nudge_input = te.lang.cce.floor(te.lang.cce.vadds(te.lang.cce.vmul(input_x, quant_max_data), 0.5))
+    input_quant = te.lang.cce.vdiv(nudge_input, quant_max_data)
+
+    dtype = input_div_alpha.dtype.lower()
+    shape = te.lang.cce.util.shape_to_list(input_div_alpha.shape)
+
+    dx = dout
+    tensor_one = tvm.const(1.0, input_div_alpha.dtype)
+    tensor_one = te.lang.cce.broadcast(tensor_one, shape)
+
+    #out_of_bounds = te.lang.cce.vcmpsel(te.lang.cce.vabs(input_div_alpha), 1.0, 'gt', 1.0, 0.0)
+    out_of_upper_bounds = te.lang.cce.vcmpsel(input_div_alpha, 1.0, 'gt', 1.0, 0.0)
+    if neg_trunc:
+        out_of_lower_bounds = te.lang.cce.vcmpsel(input_div_alpha, 0.0, 'lt', 1.0, 0.0)
+    else:
+        out_of_lower_bounds = te.lang.cce.vcmpsel(input_div_alpha, -1.0, 'lt', 1.0, 0.0)
+    out_of_bounds = te.lang.cce.vadd(out_of_lower_bounds, out_of_upper_bounds)
+
+    dx = te.lang.cce.vmul(dx, te.lang.cce.vsub(tensor_one, out_of_bounds))
+
+    # sign function imp
+    if dtype == "float32":
+        data_min = tvm.const(SCALAR_MIN_FP32, dtype=dtype)
+        neg_data_min = tvm.const(NEG_SCALAR_MIN_FP32, dtype=dtype)
+    elif dtype == "float16":
+        data_min = tvm.const(SCALAR_MIN_FP16, dtype=dtype)
+        neg_data_min = tvm.const(NEG_SCALAR_MIN_FP16, dtype=dtype)
+    else:
+        data_min = tvm.const(1, dtype=dtype)
+        neg_data_min = tvm.const(-1, dtype=dtype)
+    vmax = te.lang.cce.vmaxs(input_div_alpha, neg_data_min)
+    vmin = te.lang.cce.vmins(vmax, data_min)
+    if dtype == "float32":
+        # max num of float32 is 2**126
+        max_support_fp32 = tvm.const(2 ** 62, dtype=dtype)
+        res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp32)
+        res_mul2 = te.lang.cce.vmuls(res_mul1, max_support_fp32)
+        sign = te.lang.cce.vmuls(res_mul2, tvm.const(2 ** 2, dtype=dtype))
+    elif dtype == "float16":
+        # max num of float16 is 2**24
+        # but cce can only support 2**12, so use 12/12 to adaptor 24
+        max_support_fp16 = tvm.const(2 ** 12, dtype=dtype)
+        res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp16)
+        sign = te.lang.cce.vmuls(res_mul1, max_support_fp16)
+    else:
+        sign = vmin
+
+    # The following lines are equivalent to :
+    #     dalpha_each = dout * sign                             if out of bounds
+    #                   dout * (input_quant - input_div_alpha)  if within bounds
+
+    quant_error = te.lang.cce.vsub(input_quant, input_div_alpha)
+    within_bounds = te.lang.cce.vsub(tensor_one, out_of_bounds)
+    error_within_bounds = te.lang.cce.vmul(quant_error, within_bounds)
+    grad_range = te.lang.cce.vmadd(sign, error_within_bounds, out_of_bounds)
+    dalpha_each = te.lang.cce.vmul(dout, grad_range)
+
+    return [dx, dalpha_each]
+
+
+@util.check_input_type(dict, dict, dict, dict, dict, dict, bool, int, str)
+def fake_learned_scale_quant_perchannel_grad_d(dout, input_x, alpha, quant_max, dx, dalpha, neg_trunc,
+                                               channel_axis, kernel_name="fake_learned_scale_quant_perchannel_grad_d"):
+    """FakeLearnedScaleQuantPerChannelGradD"""
+    input_shape = input_x.get("shape")
+    input_x_shape_ = input_x.get("ori_shape")
+    input_x_format = input_x.get("format")
+    input_dtype = input_x.get("dtype")
+    alpha_shape = alpha.get("ori_shape")
+    alpha_dtype = alpha.get("dtype")
+    quant_max_shape = quant_max.get("ori_shape")
+    quant_max_dtype = quant_max.get("dtype")
+    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1], channel_axis_ need change to 1.
+    if channel_axis == 0 and input_x_shape_[0] != alpha_shape[0] and input_x_shape_[1] == alpha_shape[0]:
+        channel_axis_ = 1
+    else:
+        channel_axis_ = channel_axis
+
+    util.check_kernel_name(kernel_name)
+    util.check_shape_rule(input_shape)
+    util.check_shape_rule(alpha_shape, 1, 1, input_x_shape_[channel_axis_])
+    util.check_shape_rule(quant_max_shape, 1, 1, 1)
+    util.check_tensor_shape_size(input_shape)
+    util.check_tensor_shape_size(alpha_shape)
+    util.check_tensor_shape_size(quant_max_shape)
+
+    check_list = ["float32", "float16"]
+    input_dtype = input_dtype.lower()
+    alpha_dtype = alpha_dtype.lower()
+    quant_max_dtype = quant_max_dtype.lower()
+    util.check_dtype_rule(input_dtype, check_list)
+    util.check_dtype_rule(alpha_dtype, check_list)
+    util.check_dtype_rule(quant_max_dtype, check_list)
+
+    shape_c = [1] * len(input_shape)
+    shape_c[channel_axis_] = alpha.get("ori_shape")[0]
+    if input_x_format == "NC1HWC0" and channel_axis_ == 1:
+        shape_c = alpha.get("shape")
+
+    dout_data = tvm.placeholder(input_shape, name="dout", dtype=input_dtype)
+    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
+    alpha_data = tvm.placeholder(shape_c, name="alpha_data", dtype=alpha_dtype)
+    quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data", dtype=quant_max_dtype)
+    res = fake_learned_scale_quant_perchannel_grad_d_compute(dout_data, input_data, alpha_data, quant_max_data,
+                                                             neg_trunc, kernel_name)
+
+    with tvm.target.cce():
+        sch = generic.auto_schedule(res)
+
+    tensor_list = [dout_data, input_data, alpha_data, quant_max_data] + list(res)
+    config = {"print_ir": False,
+              "name": kernel_name,
+              "tensor_list": tensor_list}
+
+    te.lang.cce.cce_build_code(sch, config)
--- a/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad_reduce.py
+++ b/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad_reduce.py
@ -0,0 +1,88 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""FakeLearnedScaleQuantPerChannelGradDReduce op"""
+import te.lang.cce
+from te import tvm
+from te.platform.fusion_manager import fusion_manager
+from topi import generic
+from topi.cce import util
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+
+fake_learned_scale_quant_perchannel_grad_d_reduce_op_info = TBERegOp("FakeLearnedScaleQuantPerChannelGradDReduce") \
+    .fusion_type("OPAQUE") \
+    .async_flag(False) \
+    .binfile_name("fake_learned_scale_quant_perchannel_grad_d_reduce.so") \
+    .compute_cost(10) \
+    .kernel_name("fake_learned_scale_quant_perchannel_grad_d_reduce") \
+    .partial_flag(True) \
+    .attr("channel_axis", "optional", "int", "all") \
+    .input(0, "dout_alpha", None, "required", None) \
+    .output(0, "dalpha", True, "required", "all") \
+    .dtype_format(DataType.F16_Default, DataType.F16_Default) \
+    .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \
+    .get_op_info()
+
+
+@op_info_register(fake_learned_scale_quant_perchannel_grad_d_reduce_op_info)
+def _fake_learned_scale_quant_perchannel_grad_d_reduce_tbe():
+    """FakeLearnedScaleQuantPerChannelGradDReduce TBE register"""
+    return
+
+
+@fusion_manager.register("fake_learned_scale_quant_perchannel_grad_d_reduce")
+def fake_learned_scale_quant_perchannel_grad_d_reduce_compute(dout_alpha_data, dout_alpha, channel_axis,
+                                                              kernel_name="fake_learned_scale_quant_perchannel_"
+                                                                          "grad_d_reduce"):
+    """FakeLearnedScaleQuantPerChannelGradDReduce"""
+    dout_alpha_shape = dout_alpha.get("shape")
+    axis = list(range(len(dout_alpha_shape)))
+    axis.remove(channel_axis)
+    dalpha = te.lang.cce.sum(dout_alpha_data, axis, False)
+    return dalpha
+
+
+@util.check_input_type(dict, dict, int, str)
+def fake_learned_scale_quant_perchannel_grad_d_reduce(dout_alpha, dalpha, channel_axis,
+                                                      kernel_name="fake_learned_scale_quant_perchannel_grad_d_reduce"):
+    """FakeLearnedScaleQuantPerChannelGradDReduce"""
+
+    dout_alpha_shape = dout_alpha.get("shape")
+    dout_alpha_dtype = dout_alpha.get("dtype")
+
+    util.check_kernel_name(kernel_name)
+    util.check_shape_rule(dout_alpha_shape)
+    util.check_tensor_shape_size(dout_alpha_shape)
+
+    check_list = ["float32", 'float16']
+    dout_alpha_dtype = dout_alpha_dtype.lower()
+    util.check_dtype_rule(dout_alpha_dtype, check_list)
+
+    dout_alpha_data = tvm.placeholder(dout_alpha_shape, name="dout_alpha", dtype=dout_alpha_dtype)
+    res = fake_learned_scale_quant_perchannel_grad_d_reduce_compute(dout_alpha_data, dout_alpha,
+                                                                    channel_axis, kernel_name)
+
+    with tvm.target.cce():
+        sch = generic.auto_schedule(res)
+
+    tensor_list = [dout_alpha_data, res]
+    config = {"print_ir": False,
+              "name": kernel_name,
+              "tensor_list": tensor_list}
+
+    te.lang.cce.cce_build_code(sch, config)
--- a/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer.py
+++ b/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer.py
@ -0,0 +1,117 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""FakeLearnedScaleQuantPerLayer op"""
+from functools import reduce as functools_reduce
+import te.lang.cce
+from te import tvm
+from te.platform.fusion_manager import fusion_manager
+from topi import generic
+from topi.cce import util
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+fake_learned_scale_quant_perlayer_op_info = TBERegOp("FakeLearnedScaleQuantPerLayer") \
+    .fusion_type("ELEMWISE") \
+    .async_flag(False) \
+    .binfile_name("fake_learned_scale_quant_perlayer.so") \
+    .compute_cost(10) \
+    .kernel_name("fake_learned_scale_quant_perlayer") \
+    .partial_flag(True) \
+    .attr("neg_trunc", "optional", "bool", "all") \
+    .input(0, "input_x", None, "required", None) \
+    .input(1, "alpha", None, "required", None) \
+    .input(2, "quant_max", None, "required", None) \
+    .output(0, "out", True, "required", "all") \
+    .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \
+    .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \
+    .get_op_info()
+
+
+@op_info_register(fake_learned_scale_quant_perlayer_op_info)
+def _fake_learned_scale_quant_perlayer_tbe():
+    """FakeLearnedScaleQuantPerLayer TBE register"""
+    return
+
+
+@fusion_manager.register("fake_learned_scale_quant_perlayer")
+def fake_learned_scale_quant_perlayer_compute(input_data, alpha_data, quant_max_data, neg_trunc,
+                                              kernel_name="fake_learned_scale_quant_perlayer"):
+    """FakeLearnedScaleQuantPerLayer"""
+    input_shape = te.lang.cce.util.shape_to_list(input_data.shape)
+    alpha_data = te.lang.cce.broadcast(alpha_data, input_shape, input_data.dtype)
+    quant_max_data = te.lang.cce.broadcast(quant_max_data, input_shape, input_data.dtype)
+
+    input_x = te.lang.cce.vdiv(input_data, alpha_data)
+
+    if neg_trunc:
+        input_x = te.lang.cce.round_to(input_x, 1.0, 0.0)
+    else:
+        input_x = te.lang.cce.round_to(input_x, 1.0, -1.0)
+
+    nudge_input = te.lang.cce.floor(te.lang.cce.vadds(te.lang.cce.vmul(input_x, quant_max_data), 0.5))
+    input_quant = te.lang.cce.vdiv(nudge_input, quant_max_data)
+    res = te.lang.cce.vmul(input_quant, alpha_data)
+
+    return res
+
+
+@util.check_input_type(dict, dict, dict, dict, bool, str)
+def fake_learned_scale_quant_perlayer(input_x, alpha, quant_max, out, neg_trunc,
+                                      kernel_name="fake_learned_scale_quant_perlayer"):
+    """FakeLearnedScaleQuantPerLayer"""
+    input_shape = input_x.get("shape")
+    input_dtype = input_x.get("dtype")
+    alpha_shape = alpha.get("ori_shape")
+    alpha_dtype = alpha.get("dtype")
+    quant_max_shape = quant_max.get("ori_shape")
+    quant_max_dtype = quant_max.get("dtype")
+
+    alpha_shape = util.scalar2tensor_one(alpha_shape)
+    quant_max_shape = util.scalar2tensor_one(quant_max_shape)
+    util.check_kernel_name(kernel_name)
+    util.check_shape_rule(input_shape)
+    util.check_shape_rule(alpha_shape, 1, 1, 1)
+    util.check_shape_rule(quant_max_shape, 1, 1, 1)
+    util.check_tensor_shape_size(input_shape)
+    util.check_tensor_shape_size(alpha_shape)
+    util.check_tensor_shape_size(quant_max_shape)
+
+    check_list = ["float32", "float16"]
+    input_dtype = input_dtype.lower()
+    alpha_dtype = alpha_dtype.lower()
+    quant_max_dtype = quant_max_dtype.lower()
+    util.check_dtype_rule(input_dtype, check_list)
+    util.check_dtype_rule(alpha_dtype, check_list)
+    util.check_dtype_rule(quant_max_dtype, check_list)
+
+    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
+
+    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
+    alpha_data = tvm.placeholder(alpha_shape, name="alpha_data", dtype=alpha_dtype)
+    quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data", dtype=quant_max_dtype)
+    res = fake_learned_scale_quant_perlayer_compute(input_data, alpha_data, quant_max_data, neg_trunc, kernel_name)
+
+    with tvm.target.cce():
+        sch = generic.auto_schedule(res)
+
+    tensor_list = [input_data, alpha_data, quant_max_data, res]
+    config = {"print_ir": False,
+              "name": kernel_name,
+              "tensor_list": tensor_list,
+              "bool_storage_as_1bit": False}
+
+    te.lang.cce.cce_build_code(sch, config)
--- a/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad.py
+++ b/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad.py
@ -0,0 +1,184 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""FakeLearnedScaleQuantPerLayerGradD op"""
+
+from functools import reduce as functools_reduce
+import te.lang.cce
+from te import tvm
+from te.platform.fusion_manager import fusion_manager
+from topi import generic
+from topi.cce import util
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+NEG_SCALAR_MIN_FP16 = -(2 ** (-24))
+NEG_SCALAR_MIN_FP32 = -(2 ** (-126))
+SCALAR_MIN_FP16 = 2 ** (-24)
+SCALAR_MIN_FP32 = 2 ** (-126)
+
+fake_learned_scale_quant_perlayer_grad_d_op_info = TBERegOp("FakeLearnedScaleQuantPerLayerGradD") \
+    .fusion_type("OPAQUE") \
+    .async_flag(False) \
+    .binfile_name("fake_learned_scale_quant_perlayer_grad_d.so") \
+    .compute_cost(10) \
+    .kernel_name("fake_learned_scale_quant_perlayer_grad_d") \
+    .partial_flag(True) \
+    .attr("neg_trunc", "optional", "bool", "all") \
+    .input(0, "dout", None, "required", None) \
+    .input(1, "input_x", None, "required", None) \
+    .input(2, "alpha", None, "required", None) \
+    .input(3, "quant_max", None, "required", None) \
+    .output(0, "dx", True, "required", "all") \
+    .output(1, "dalpha", True, "required", "all") \
+    .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default,
+                  DataType.F16_Default, DataType.F16_Default) \
+    .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD,
+                  DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default,
+                  DataType.F32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD,
+                  DataType.F32_5HD, DataType.F32_5HD) \
+    .get_op_info()
+
+
+@op_info_register(fake_learned_scale_quant_perlayer_grad_d_op_info)
+def _fake_learned_scale_quant_perlayer_grad_d_tbe():
+    """FakeLearnedScaleQuantPerLayerGradD TBE register"""
+    return
+
+
+@fusion_manager.register("fake_learned_scale_quant_perlayer_grad_d")
+def fake_learned_scale_quant_perlayer_grad_d_compute(dout, input_data, alpha_data, quant_max_data, neg_trunc,
+                                                     kernel_name="fake_learned_scale_quant_perlayer_grad_d"):
+    """FakeLearnedScaleQuantPerLayerGradD"""
+    input_shape = te.lang.cce.util.shape_to_list(input_data.shape)
+    alpha_data = te.lang.cce.broadcast(alpha_data, input_shape, input_data.dtype)
+    quant_max_data = te.lang.cce.broadcast(quant_max_data, input_shape, input_data.dtype)
+
+    input_x = te.lang.cce.vdiv(input_data, alpha_data)
+    input_div_alpha = input_x
+
+    if neg_trunc:
+        input_x = te.lang.cce.round_to(input_x, 1.0, 0.0)
+    else:
+        input_x = te.lang.cce.round_to(input_x, 1.0, -1.0)
+
+    nudge_input = te.lang.cce.floor(te.lang.cce.vadds(te.lang.cce.vmul(input_x, quant_max_data), 0.5))
+    input_quant = te.lang.cce.vdiv(nudge_input, quant_max_data)
+
+    dtype = input_div_alpha.dtype.lower()
+    shape = te.lang.cce.util.shape_to_list(input_div_alpha.shape)
+
+    dx = dout
+    tensor_one = tvm.const(1.0, input_div_alpha.dtype)
+    tensor_one = te.lang.cce.broadcast(tensor_one, shape)
+
+    #out_of_bounds = te.lang.cce.vcmpsel(te.lang.cce.vabs(input_div_alpha), 1.0, 'gt', 1.0, 0.0)
+    out_of_upper_bounds = te.lang.cce.vcmpsel(input_div_alpha, 1.0, 'gt', 1.0, 0.0)
+    if neg_trunc:
+        out_of_lower_bounds = te.lang.cce.vcmpsel(input_div_alpha, 0.0, 'lt', 1.0, 0.0)
+    else:
+        out_of_lower_bounds = te.lang.cce.vcmpsel(input_div_alpha, -1.0, 'lt', 1.0, 0.0)
+    out_of_bounds = te.lang.cce.vadd(out_of_lower_bounds, out_of_upper_bounds)
+
+    dx = te.lang.cce.vmul(dx, te.lang.cce.vsub(tensor_one, out_of_bounds))
+
+    # sign function imp
+    if dtype == "float32":
+        data_min = tvm.const(SCALAR_MIN_FP32, dtype=dtype)
+        neg_data_min = tvm.const(NEG_SCALAR_MIN_FP32, dtype=dtype)
+    elif dtype == "float16":
+        data_min = tvm.const(SCALAR_MIN_FP16, dtype=dtype)
+        neg_data_min = tvm.const(NEG_SCALAR_MIN_FP16, dtype=dtype)
+    else:
+        data_min = tvm.const(1, dtype=dtype)
+        neg_data_min = tvm.const(-1, dtype=dtype)
+    vmax = te.lang.cce.vmaxs(input_div_alpha, neg_data_min)
+    vmin = te.lang.cce.vmins(vmax, data_min)
+    if dtype == "float32":
+        # max num of float32 is 2**126
+        max_support_fp32 = tvm.const(2 ** 62, dtype=dtype)
+        res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp32)
+        res_mul2 = te.lang.cce.vmuls(res_mul1, max_support_fp32)
+        sign = te.lang.cce.vmuls(res_mul2, tvm.const(2 ** 2, dtype=dtype))
+    elif dtype == "float16":
+        # max num of float16 is 2**24
+        # but cce can only support 2**12, so use 12/12 to adaptor 24
+        max_support_fp16 = tvm.const(2 ** 12, dtype=dtype)
+        res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp16)
+        sign = te.lang.cce.vmuls(res_mul1, max_support_fp16)
+    else:
+        sign = vmin
+
+    # The following lines are equivalent to :
+    #     dalpha_each = dout * sign                             if out of bounds
+    #                   dout * (input_quant - input_div_alpha)  if within bounds
+
+    quant_error = te.lang.cce.vsub(input_quant, input_div_alpha)
+    within_bounds = te.lang.cce.vsub(tensor_one, out_of_bounds)
+    error_within_bounds = te.lang.cce.vmul(quant_error, within_bounds)
+    grad_range = te.lang.cce.vmadd(sign, error_within_bounds, out_of_bounds)
+    dalpha_each = te.lang.cce.vmul(dout, grad_range)
+
+    return [dx, dalpha_each]
+
+
+@util.check_input_type(dict, dict, dict, dict, dict, dict, bool, str)
+def fake_learned_scale_quant_perlayer_grad_d(dout, input_x, alpha, quant_max, dx, dalpha, neg_trunc,
+                                             kernel_name="fake_learned_scale_quant_perlayer_grad_d"):
+    """FakeLearnedScaleQuantPerLayerGradD"""
+    input_shape = input_x.get("shape")
+    input_dtype = input_x.get("dtype")
+    alpha_shape = alpha.get("ori_shape")
+    alpha_dtype = alpha.get("dtype")
+    quant_max_shape = quant_max.get("ori_shape")
+    quant_max_dtype = quant_max.get("dtype")
+
+    alpha_shape = util.scalar2tensor_one(alpha_shape)
+    quant_max_shape = util.scalar2tensor_one(quant_max_shape)
+    util.check_kernel_name(kernel_name)
+    util.check_shape_rule(input_shape)
+    util.check_shape_rule(alpha_shape, 1, 1, 1)
+    util.check_shape_rule(quant_max_shape, 1, 1, 1)
+    util.check_tensor_shape_size(input_shape)
+    util.check_tensor_shape_size(alpha_shape)
+    util.check_tensor_shape_size(quant_max_shape)
+
+    check_list = ["float32", "float16"]
+    input_dtype = input_dtype.lower()
+    alpha_dtype = alpha_dtype.lower()
+    quant_max_dtype = quant_max_dtype.lower()
+    util.check_dtype_rule(input_dtype, check_list)
+    util.check_dtype_rule(alpha_dtype, check_list)
+    util.check_dtype_rule(quant_max_dtype, check_list)
+
+    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
+
+    dout_data = tvm.placeholder(input_shape, name="dout", dtype=input_dtype)
+    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
+    alpha_data = tvm.placeholder(alpha_shape, name="alpha_data", dtype=alpha_dtype)
+    quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data", dtype=quant_max_dtype)
+    res = fake_learned_scale_quant_perlayer_grad_d_compute(dout_data, input_data, alpha_data, quant_max_data,
+                                                           neg_trunc, kernel_name)
+
+    with tvm.target.cce():
+        sch = generic.auto_schedule(res)
+
+    tensor_list = [dout_data, input_data, alpha_data, quant_max_data] + list(res)
+    config = {"print_ir": False,
+              "name": kernel_name,
+              "tensor_list": tensor_list}
+
+    te.lang.cce.cce_build_code(sch, config)
--- a/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad_reduce.py
+++ b/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad_reduce.py
@ -0,0 +1,88 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""FakeLearnedScaleQuantPerLayerGradDReduce op"""
+
+from functools import reduce as functools_reduce
+import te.lang.cce
+from te import tvm
+from te.platform.fusion_manager import fusion_manager
+from topi import generic
+from topi.cce import util
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+
+fake_learned_scale_quant_perlayer_grad_d_reduce_op_info = TBERegOp("FakeLearnedScaleQuantPerLayerGradDReduce") \
+    .fusion_type("OPAQUE") \
+    .async_flag(False) \
+    .binfile_name("fake_learned_scale_quant_perlayer_grad_d_reduce.so") \
+    .compute_cost(10) \
+    .kernel_name("fake_learned_scale_quant_perlayer_grad_d_reduce") \
+    .partial_flag(True) \
+    .input(0, "dout_alpha", None, "required", None) \
+    .output(0, "dalpha", True, "required", "all") \
+    .dtype_format(DataType.F16_Default, DataType.F16_Default) \
+    .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \
+    .get_op_info()
+
+
+@op_info_register(fake_learned_scale_quant_perlayer_grad_d_reduce_op_info)
+def _fake_learned_scale_quant_perlayer_grad_d_reduce_tbe():
+    """FakeLearnedScaleQuantPerLayerGradDReduce TBE register"""
+    return
+
+
+@fusion_manager.register("fake_learned_scale_quant_perlayer_grad_d_reduce")
+def fake_learned_scale_quant_perlayer_grad_d_reduce_compute(dout_alpha,
+                                                            kernel_name="fake_learned_scale_quant_perlayer_"
+                                                                        "grad_d_reduce"):
+    """FakeLearnedScaleQuantPerLayerGradDReduce"""
+    dalpha = te.lang.cce.sum(dout_alpha, 0, False)
+
+    return dalpha
+
+
+@util.check_input_type(dict, dict, str)
+def fake_learned_scale_quant_perlayer_grad_d_reduce(dout_alpha, dalpha,
+                                                    kernel_name="fake_learned_scale_quant_perlayer_grad_d_reduce"):
+    """FakeLearnedScaleQuantPerLayerGradDReduce"""
+
+    dout_alpha_shape = dout_alpha.get("shape")
+    dout_alpha_dtype = dout_alpha.get("dtype")
+
+    util.check_kernel_name(kernel_name)
+    util.check_shape_rule(dout_alpha_shape)
+    util.check_tensor_shape_size(dout_alpha_shape)
+
+    check_list = ["float32", 'float16']
+    dout_alpha_dtype = dout_alpha_dtype.lower()
+    util.check_dtype_rule(dout_alpha_dtype, check_list)
+
+    input_shape = (functools_reduce(lambda x, y: x * y, dout_alpha_shape[:]),)
+
+    dout_alpha_data = tvm.placeholder(input_shape, name="dout_alpha", dtype=dout_alpha_dtype)
+    res = fake_learned_scale_quant_perlayer_grad_d_reduce_compute(dout_alpha_data, kernel_name)
+
+    with tvm.target.cce():
+        sch = generic.auto_schedule(res)
+
+    tensor_list = [dout_alpha_data, res]
+    config = {"print_ir": False,
+              "name": kernel_name,
+              "tensor_list": tensor_list}
+
+    te.lang.cce.cce_build_code(sch, config)
--- a/mindspore/ops/operations/_quant_ops.py
+++ b/mindspore/ops/operations/_quant_ops.py
@ -24,6 +24,14 @@ from ...common import dtype as mstype

 __all__ = ["MinMaxUpdatePerLayer",
           "MinMaxUpdatePerChannel",
+           "FakeLearnedScaleQuantPerLayer",
+           "FakeLearnedScaleQuantPerLayerGrad",
+           "FakeLearnedScaleQuantPerLayerGradD",
+           "FakeLearnedScaleQuantPerLayerGradDReduce",
+           "FakeLearnedScaleQuantPerChannel",
+           "FakeLearnedScaleQuantPerChannelGrad",
+           "FakeLearnedScaleQuantPerChannelGradD",
+           "FakeLearnedScaleQuantPerChannelGradDReduce",
           "FakeQuantWithMinMaxVars",
           "FakeQuantWithMinMaxVarsGradient",
           "FakeQuantWithMinMaxVarsPerChannel",
@ -169,6 +177,321 @@ class MinMaxUpdatePerChannel(PrimitiveWithInfer):
        return min_type, max_type


+class FakeLearnedScaleQuantPerLayer(PrimitiveWithInfer):
+    r"""
+    Simulates the quantize and dequantize operations of the fake learned scale quant per-layer case in training time.
+
+    Args:
+        quant_delay (int): Quantilization delay parameter. Before delay step in training time not update
+            simulate quantization aware function. After delay step in training time begin simulate the aware
+            quantize function. Default: 0.
+        neg_trunc (bool): Whether the quantization algorithm uses nagetive truncation or not. Default: False.
+        training (bool): Training the network or not. Default: True.
+
+    Inputs:
+        - **input_x** (Tensor) : Input tensor that needs to be quantified.
+        - **alpha** (Tensor) : Value of the max clipping range of the input data `input_x`.
+        - **quant_max** (Tensor) : Value of the quantization range.
+
+    Outputs:
+        - Tensor: Simulates quantize tensor of `input_x`，with the same type and shape as the `input_x`.
+
+    Examples:
+        >>> input_tensor = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32)
+        >>> alpha_tensor = Tensor(np.array([6]), mstype.float32)
+        >>> quant_max_tensor = Tensor(np.array([127]), mstype.float32)
+        >>> output_tensor = FakeLearnedScaleQuantPerLayer()(input_tensor, alpha_tensor, quant_max_tensor)
+    """
+    @prim_attr_register
+    def __init__(self,
+                 quant_delay=0,
+                 neg_trunc=False,
+                 training=True):
+        """init FakeLearnedScaleQuantPerLayer OP"""
+        if context.get_context('device_target') == "Ascend":
+            from mindspore.ops._op_impl._custom_op import fake_learned_scale_quant_perlayer
+
+        self.quant_delay = validator.check_non_negative_int(
+            quant_delay, 'quant_delay', self.name)
+        self.neg_trunc = validator.check_value_type(
+            'neg_trunc', neg_trunc, (bool,), self.name)
+        self.training = validator.check_value_type(
+            'training', training, (bool,), self.name)
+        self.init_prim_io_names(inputs=['input_x', 'alpha', 'quant_max'],
+                                outputs=['out'])
+
+    def infer_shape(self, input_x_shape, alpha_shape, quant_max_shape):
+        validator.check_int(len(input_x_shape), 1, Rel.GE, "input_x rank", self.name)
+        validator.check_int(len(alpha_shape), 1, Rel.GE, "alpha rank", self.name)
+        validator.check_int(len(quant_max_shape), 1, Rel.GE, "quant max rank", self.name)
+        return input_x_shape
+
+    def infer_dtype(self, input_x_type, alpha_type, quant_max_type):
+        if context.get_context('device_target') == "GPU":
+            valid_dtypes = (mstype.float32,)
+        else:
+            valid_dtypes = (mstype.float16, mstype.float32)
+        tuple(map(partial(validator.check_tensor_dtype_valid, valid_dtypes=valid_dtypes, prim_name=self.name),
+                  ("input_x", "alpha", "quant_max"),
+                  (input_x_type, alpha_type, quant_max_type)))
+        return input_x_type
+
+
+class FakeLearnedScaleQuantPerLayerGrad(PrimitiveWithInfer):
+    r"""
+    Performs grad of FakeLearnedScaleQuantPerLayer operation.
+
+    Examples:
+        >>> fake_learned_scale_grad = FakeLearnedScaleQuantPerLayerGrad()
+        >>> dout = Tensor(np.array([[-2.3, 1.2], [5.7, 0.2]]), mindspore.float32)
+        >>> input_x = Tensor(np.array([[18, -23], [0.2, 6]]), mindspore.float32)
+        >>> _alpha = Tensor(np.array([6]), mindspore.float32)
+        >>> _quant_max = Tensor(np.array([127]), mindspore.float32)
+        >>> result = fake_learned_scale_grad(dout, input_x, _min, _max)
+    """
+
+    @prim_attr_register
+    def __init__(self,
+                 quant_delay=0,
+                 neg_trunc=False):
+        self.quant_delay = validator.check_non_negative_int(
+            quant_delay, 'quant_delay', self.name)
+        self.neg_trunc = validator.check_value_type(
+            'neg_trunc', neg_trunc, (bool,), self.name)
+        self.init_prim_io_names(
+            inputs=['dout', 'x', 'alpha', 'quant_max'], outputs=['dx', 'dalpha'])
+
+    def infer_shape(self, dout_shape, x_shape, alpha_shape, quant_max_shape):
+        validator.check("dout shape", dout_shape, "x_shape", x_shape, Rel.EQ, self.name)
+        validator.check_int(len(alpha_shape), 1, Rel.GE, "alpha rank", self.name)
+        validator.check_int(len(quant_max_shape), 1, Rel.GE, "quant max rank", self.name)
+        return dout_shape, alpha_shape
+
+    def infer_dtype(self, dout_type, x_type, alpha_type, quant_max_type):
+        if context.get_context('device_target') == "GPU":
+            valid_dtypes = (mstype.float32,)
+        else:
+            valid_dtypes = (mstype.float16, mstype.float32)
+        tuple(map(partial(validator.check_tensor_dtype_valid, valid_dtypes=valid_dtypes, prim_name=self.name),
+                  ("dout", "x", "alpha", "quant_max"),
+                  (dout_type, x_type, alpha_type, quant_max_type)))
+        return dout_type, alpha_type
+
+
+class FakeLearnedScaleQuantPerLayerGradD(PrimitiveWithInfer):
+    r"""
+    Performs input grad of FakeLearnedScaleQuantPerLayer operation.
+    """
+
+    @prim_attr_register
+    def __init__(self,
+                 neg_trunc=False):
+        from mindspore.ops._op_impl._custom_op import fake_learned_scale_quant_perlayer_grad
+        self.neg_trunc = validator.check_value_type(
+            'neg_trunc', neg_trunc, (bool,), self.name)
+        self.init_prim_io_names(
+            inputs=['dout', 'x', 'alpha', 'quant_max'], outputs=['dx', 'dalpha'])
+
+    def infer_shape(self, dout_shape, x_shape, alpha_shape, quant_max_shape):
+        validator.check("dout shape", dout_shape, "x_shape", x_shape, Rel.EQ, self.name)
+        validator.check_int(len(alpha_shape), 1, Rel.GE, "alpha rank", self.name)
+        validator.check_int(len(quant_max_shape), 1, Rel.GE, "quant max rank", self.name)
+        return dout_shape, dout_shape
+
+    def infer_dtype(self, dout_type, x_type, alpha_type, quant_max_type):
+        valid_dtypes = (mstype.float16, mstype.float32)
+        tuple(map(partial(validator.check_tensor_dtype_valid, valid_dtypes=valid_dtypes, prim_name=self.name),
+                  ("dout", "x", "alpha", "quant_max"),
+                  (dout_type, x_type, alpha_type, quant_max_type)))
+        return dout_type, dout_type
+
+
+class FakeLearnedScaleQuantPerLayerGradDReduce(PrimitiveWithInfer):
+    r"""
+    Performs alpha grad reduce of FakeLearnedScaleQuantPerLayer operation.
+    """
+
+    @prim_attr_register
+    def __init__(self):
+        from mindspore.ops._op_impl._custom_op import fake_learned_scale_quant_perlayer_grad_reduce
+        self.init_prim_io_names(
+            inputs=['dout_alpha'], outputs=['dalpha'])
+
+    def infer_shape(self, dout_alpha_shape):
+        return (1,)
+
+    def infer_dtype(self, dout_alpha_type):
+        valid_dtypes = (mstype.float16, mstype.float32)
+        validator.check_tensor_dtype_valid("dout_alpha", dout_alpha_type, valid_dtypes, self.name)
+        return dout_alpha_type
+
+
+class FakeLearnedScaleQuantPerChannel(PrimitiveWithInfer):
+    r"""
+    Simulates the quantize and dequantize operations of the fake learned scale quant per-chnnel case in training time.
+
+    Args:
+        quant_delay (int): Quantilization delay parameter. Before delay step in training time not update
+            simulate quantization aware function. After delay step in training time begin simulate the aware
+            quantize function. Default: 0.
+        neg_trunc (bool): Whether the quantization algorithm uses nagetive truncation or not. Default: False.
+        training (bool): Training the network or not. Default: True.
+        channel_axis (int): Quantization by channel axis. Ascend backend only supports 0 or 1. Default: 1.
+
+    Inputs:
+        - **input_x** (Tensor) : Input tensor that needs to be quantified.
+        - **alpha** (Tensor) : Value of the max clipping range of the input data `input_x`.
+        - **quant_max** (Tensor) : Value of the quantization range.
+
+    Outputs:
+        - Tensor: Simulates quantize tensor of `input_x`，with the same type and shape as the `input_x`.
+
+    Examples:
+        >>> input_tensor = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32)
+        >>> alpha_tensor = Tensor(np.array([6]*3), mstype.float32)
+        >>> quant_max_tensor = Tensor(np.array([127]), mstype.float32)
+        >>> output_tensor = FakeLearnedScaleQuantPerChannel()(input_tensor, alpha_tensor, quant_max_tensor)
+    """
+    ascend_support_x_rank = [2, 4]
+
+    @prim_attr_register
+    def __init__(self,
+                 quant_delay=0,
+                 neg_trunc=False,
+                 training=True,
+                 channel_axis=1):
+        """init FakeLearnedScaleQuantPerChannel OP"""
+        if context.get_context('device_target') == "Ascend":
+            from mindspore.ops._op_impl._custom_op import fake_learned_scale_quant_perchannel
+        self.is_ascend = context.get_context('device_target') == "Ascend"
+        self.quant_delay = validator.check_non_negative_int(
+            quant_delay, 'quant_delay', self.name)
+        self.neg_trunc = validator.check_value_type(
+            'neg_trunc', neg_trunc, (bool,), self.name)
+        self.training = validator.check_value_type(
+            'training', training, (bool,), self.name)
+        if self.is_ascend:
+            self.channel_axis = validator.check_int_range(channel_axis, 0, 1, Rel.INC_BOTH, 'channel_axis', self.name)
+        else:
+            self.channel_axis = validator.check_non_negative_int(channel_axis, 'channel_axis', self.name)
+        self.init_prim_io_names(inputs=['input_x', 'alpha', 'quant_max'],
+                                outputs=['out'])
+
+    def infer_shape(self, input_x_shape, alpha_shape, quant_max_shape):
+        if self.is_ascend and len(input_x_shape) not in self.ascend_support_x_rank:
+            raise ValueError(f"For '{self.name}' x rank should be in '{self.ascend_support_x_rank}'")
+        if not self.is_ascend:
+            validator.check_int(len(input_x_shape), 1, Rel.GE, "input_x rank", self.name)
+        if len(input_x_shape) == 1:
+            self.channel_axis = 0
+
+        validator.check_equal_int(alpha_shape[0], input_x_shape[self.channel_axis], "alpha rank", self.name)
+        validator.check_int(len(quant_max_shape), 1, Rel.GE, "quant max rank", self.name)
+        return input_x_shape
+
+    def infer_dtype(self, input_x_type, alpha_type, quant_max_type):
+        if context.get_context('device_target') == "GPU":
+            valid_dtypes = (mstype.float32,)
+        else:
+            valid_dtypes = (mstype.float16, mstype.float32)
+        tuple(map(partial(validator.check_tensor_dtype_valid, valid_dtypes=valid_dtypes, prim_name=self.name),
+                  ("input_x", "alpha", "quant_max"),
+                  (input_x_type, alpha_type, quant_max_type)))
+        return input_x_type
+
+
+class FakeLearnedScaleQuantPerChannelGrad(PrimitiveWithInfer):
+    r"""
+    Performs grad of FakeLearnedScaleQuantPerChannel operation.
+
+    Examples:
+        >>> fake_learned_scale_grad = FakeLearnedScaleQuantPerChannelGrad()
+        >>> dout = Tensor(np.array([[-2.3, 1.2], [5.7, 0.2]]), mindspore.float32)
+        >>> input_x = Tensor(np.array([[18, -23], [0.2, 6]]), mindspore.float32)
+        >>> _alpha = Tensor(np.array([6]*2), mindspore.float32)
+        >>> _quant_max = Tensor(np.array([127]), mindspore.float32)
+        >>> result = fake_learned_scale_grad(dout, input_x, _min, _max)
+    """
+
+    @prim_attr_register
+    def __init__(self,
+                 quant_delay=0,
+                 neg_trunc=False,
+                 channel_axis=1):
+        self.quant_delay = validator.check_non_negative_int(
+            quant_delay, 'quant_delay', self.name)
+        self.neg_trunc = validator.check_value_type(
+            'neg_trunc', neg_trunc, (bool,), self.name)
+        self.channel_axis = validator.check_non_negative_int(channel_axis, 'channel axis', self.name)
+        self.init_prim_io_names(
+            inputs=['dout', 'x', 'alpha', 'quant_max'], outputs=['dx', 'dalpha'])
+
+    def infer_shape(self, dout_shape, x_shape, alpha_shape, quant_max_shape):
+        validator.check("dout shape", dout_shape, "x_shape", x_shape, Rel.EQ, self.name)
+        return dout_shape, alpha_shape
+
+    def infer_dtype(self, dout_type, x_type, alpha_type, quant_max_type):
+        if context.get_context('device_target') == "GPU":
+            valid_dtypes = (mstype.float32,)
+        else:
+            valid_dtypes = (mstype.float16, mstype.float32)
+        tuple(map(partial(validator.check_tensor_dtype_valid, valid_dtypes=valid_dtypes, prim_name=self.name),
+                  ("dout", "x", "alpha", "quant_max"),
+                  (dout_type, x_type, alpha_type, quant_max_type)))
+        return dout_type, alpha_type
+
+
+class FakeLearnedScaleQuantPerChannelGradD(PrimitiveWithInfer):
+    r"""
+    Performs input grad of FakeLearnedScaleQuantPerChannel operation.
+    """
+
+    @prim_attr_register
+    def __init__(self,
+                 neg_trunc=False,
+                 channel_axis=1):
+        from mindspore.ops._op_impl._custom_op import fake_learned_scale_quant_perchannel_grad
+        self.neg_trunc = validator.check_value_type(
+            'neg_trunc', neg_trunc, (bool,), self.name)
+        self.channel_axis = validator.check_non_negative_int(channel_axis, 'channel axis', self.name)
+        self.init_prim_io_names(
+            inputs=['dout', 'x', 'alpha', 'quant_max'], outputs=['dx', 'dalpha'])
+
+    def infer_shape(self, dout_shape, x_shape, alpha_shape, quant_max_shape):
+        validator.check("dout shape", dout_shape, "x_shape", x_shape, Rel.EQ, self.name)
+        validator.check_int(len(alpha_shape), 1, Rel.GE, "alpha rank", self.name)
+        validator.check_int(len(quant_max_shape), 1, Rel.GE, "quant max rank", self.name)
+        return dout_shape, dout_shape
+
+    def infer_dtype(self, dout_type, x_type, alpha_type, quant_max_type):
+        valid_dtypes = (mstype.float16, mstype.float32)
+        tuple(map(partial(validator.check_tensor_dtype_valid, valid_dtypes=valid_dtypes, prim_name=self.name),
+                  ("dout", "x", "alpha", "quant_max"),
+                  (dout_type, x_type, alpha_type, quant_max_type)))
+        return dout_type, dout_type
+
+
+class FakeLearnedScaleQuantPerChannelGradDReduce(PrimitiveWithInfer):
+    r"""
+    Performs alpha grad reduce of FakeLearnedScaleQuantPerChannel operation.
+    """
+
+    @prim_attr_register
+    def __init__(self, channel_axis=1):
+        from mindspore.ops._op_impl._custom_op import fake_learned_scale_quant_perchannel_grad_reduce
+        self.channel_axis = validator.check_non_negative_int(channel_axis, 'channel axis', self.name)
+        self.init_prim_io_names(
+            inputs=['dout_alpha'], outputs=['dalpha'])
+
+    def infer_shape(self, dout_alpha_shape):
+        return (dout_alpha_shape[self.channel_axis],)
+
+    def infer_dtype(self, dout_alpha_type):
+        valid_dtypes = (mstype.float16, mstype.float32)
+        validator.check_tensor_dtype_valid("dout_alpha", dout_alpha_type, valid_dtypes, self.name)
+        return dout_alpha_type
+
+
 class FakeQuantWithMinMaxVars(PrimitiveWithInfer):
    r"""
    Fake-quantize the input by min and max.
--- a/model_zoo/official/cv/mobilenetv2_quant/README_CN.md
+++ b/model_zoo/official/cv/mobilenetv2_quant/README_CN.md
@ -59,13 +59,20 @@ MobileNetV2总体网络架构如下：

 ## 混合精度

-采用[混合精度](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/enable_mixed_precision.html)的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度，同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时，支持在特定硬件上训练更大的模型或实现更大批次的训练。
+采用[混合精度](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/enable_mixed_precision.html)
+的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度，同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时，支持在特定硬件上训练更大的模型或实现更大批次的训练。
 以FP16算子为例，如果输入数据类型为FP32，MindSpore后台会自动降低精度来处理数据。用户可打开INFO日志，搜索“reduce precision”查看精度降低的算子。

+## 量化步长可学习的量化感知训练
+
+参考论文[Learned Step Size Quantization](https://arxiv.org/abs/1902.08153)
+对量化感知训练进优化，量化步长由训练学习得到，该特性对低比特量化场景有较好收益，下述中简称为LSQ。
+用户可自由选择是否使用LSQ量化方案进行量化。
+
 # 环境要求

- 硬件：昇腾处理器（Ascend）
-    - 使用昇腾处理器来搭建硬件环境。
+- 硬件  (Ascend/GPU)
+    - 使用Ascend或GPU来搭建硬件环境。
 - 框架
    - [MindSpore](https://www.mindspore.cn/install)
 - 如需查看详情，请参见如下资源
@ -80,8 +87,10 @@ MobileNetV2总体网络架构如下：
 ├── mobileNetv2_quant
  ├── Readme.md     # MobileNetV2-Quant相关描述
  ├── scripts
-  │   ├──run_train.sh   # 使用昇腾处理器进行训练的shell脚本
-  │   ├──run_infer.sh    # 使用昇腾处理器进行评估的shell脚本
+  │   ├──run_train.sh    # 使用Ascend或GPU进行训练的shell脚本
+  │   ├──run_infer.sh    # 使用Ascend或GPU进行评估的shell脚本
+  │   ├──run_lsq_train.sh    # 使用Ascend或GPU进行LSQ训练的shell脚本
+  │   ├──run_lsq_infer.sh    # 使用Ascend或GPU进行LSQ评估的shell脚本
  ├── src
  │   ├──config.py      # 参数配置
  │   ├──dataset.py     # 创建数据集
@ -91,14 +100,14 @@ MobileNetV2总体网络架构如下：
  │   ├──utils.py       # 提供监控模块
  ├── train.py      # 训练脚本
  ├── eval.py       # 评估脚本
-  ├── export.py     # 导出检查点文件到air/onnx中
+  ├── export.py     # 导出检查点文件到air/mindir中
 ```

 ## 脚本参数

 在config.py中可以同时配置训练参数和评估参数。

- 配置MobileNetV2-quant和ImageNet2012数据集。
+- 配置MobileNetV2-quant和ImageNet2012数据集（以Ascend环境配置为例，详见src/config.py ）。

  ```python
  'num_classes':1000       # 数据集类数
@ -124,15 +133,42 @@ MobileNetV2总体网络架构如下：

 使用python或shell脚本开始训练。shell脚本的使用方法如下：

+传统量化感知训练（默认）：
+
 - bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\（可选）
+
 - bash run_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\（可选）

+量化步长可学习的量化感知训练：
+
+- bash run_lsq_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]
+
+- bash run_lsq_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]
+
+ `PRETRAINED_CKPT_PATH` 是可选择的选项，如果用户配置该选项，则基于用户指定的预训练模型进行量化，我们更推荐用户基于预训练模型量化。
+
+ `RANK_TABLE_FILE` 是在Ascned上运行分布式任务时HCCL的配置文件。
+
 ### 启动

 ``` bash
-  # 训练示例
-  >>> bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt
-  >>> bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt
+  # 训练示例-传统量化感知训练（默认）
+  python：
+          Ascend:  python train.py --device_target Ascend --dataset_path ~/imagenet/train/
+          GPU:  python train.py --device_target GPU --dataset_path ~/imagenet/train/
+  shell：
+          Ascend: bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt
+          GPU: bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt
+
+  # 训练示例-量化步长可学习的量化感知训练  
+  python：
+          Ascend:  python train.py --device_target Ascend --dataset_path ~/imagenet/train/ \
+                   --pre_trained ~/mobilenet.ckpt --optim_option "LEARNED_SCALE"
+          GPU:  python train.py --device_target GPU --dataset_path ~/imagenet/train/ \
+                --pre_trained ~/mobilenet.ckpt --optim_option "LEARNED_SCALE"
+  shell：
+          Ascend: bash run_lsq_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt
+          GPU: bash run_lsq_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt
 ```

 ### 结果
@ -151,16 +187,38 @@ epoch time:138331.250, per step time:221.330, avg loss:3.917

 ### 用法

-使用python或shell脚本开始训练。shell脚本的使用方法如下：
+使用python或shell脚本开始评估。shell脚本的使用方法如下：

- Ascend: sh run_infer_quant.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH]
+传统量化感知训练（默认）：
+
+- Ascend: sh run_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH]
+- GPU: sh run_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH]
+
+量化步长可学习的量化感知训练：
+
+- Ascend: sh run_lsq_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH]
+- GPU: sh run_lsq_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH]

 ### 启动

 ```bash
-# 推理示例
-  shell:
-      Ascend: sh run_infer_quant.sh Ascend ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
+# 推理示例-传统量化感知训练（默认）
+python：
+       Ascend:  python eval.py --device_target Ascend --dataset_path [VAL_DATASET_PATH] --checkpoint_path ~/train/mobilenet-60_1601.ckpt
+       GPU:  python eval.py --device_target GPU --dataset_path [VAL_DATASET_PATH] --checkpoint_path ~/train/mobilenet-60_1601.ckpt
+shell:
+      Ascend: sh run_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
+      GPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
+
+# 推理示例-量化步长可学习的量化感知训练
+python：
+       Ascend:  python eval.py --device_target Ascend --dataset_path ~/imagenet/val/ \
+                --checkpoint_path ~/train/mobilenet-60_1601.ckpt --optim_option "LEARNED_SCALE"
+       GPU:  python eval.py --device_target GPU --dataset_path ~/imagenet/val/ \
+             --checkpoint_path ~/train/mobilenet-60_1601.ckpt --optim_option "LEARNED_SCALE"
+shell:
+      Ascend: sh run_lsq_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
+      GPU: sh run_lsq_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
 ```

 > 训练过程中可以生成检查点。
@ -173,45 +231,59 @@ epoch time:138331.250, per step time:221.330, avg loss:3.917
 result:{'acc':0.71976314102564111}
 ```

+## 模型导出
+
+```shell
+python export.py --checkpoint_path [CKPT_PATH] --file_format [EXPORT_FORMAT] --device_target [PLATFORM] --optim_option [OptimizeOption]
+```
+
+`EXPORT_FORMAT` 可选 ["AIR", "MINDIR"].
+
+`OptimizeOption` 可选 ["QAT", "LEARNED_SCALE"].
+
 # 模型描述

 ## 性能

 ### 训练性能

-| 参数                 | MobilenetV2                                                |
-| -------------------------- | ---------------------------------------------------------- |
-| 模型版本              | V2                                                         |
-| 资源                   | Ascend 910；CPU 2.60GHz，192核；内存 755G；系统 Euler2.8               |
-| 上传日期              | 2020-06-06                                                 |
-| MindSpore版本          | 0.3.0                                                      |
-| 数据集                    | ImageNet                                                   |
-| 训练参数        | src/config.py                                              |
-| 优化器                  | Momentum                                                   |
-| 损失函数              | Softmax交叉熵                                        |
-| 输出                    | ckpt文件                                                  |
-| 损失                       | 1.913                                                      |
-| 准确率                   |                                                            |
-| 总时长                 | 16 h                                                        |
-| 参数(M)                 | batch_size=192, epoch=60                                   |
-| 微调检查点 |                                                            |
-| 推理模型        |                                                            |
+| 参数            | MobilenetV2                             |   MobilenetV2                |
+| ---------------| ----------------------------------------| ---------------------------- |
+| 模型版本         | V2                                     | V2                          |
+| 量化方案         | 传统量化感知训练（默认）                    |量化步长可学习的量化感知训练      |
+| 量化策略         | W:8bit, A:8bit                          | W:4bit (首尾层为 8bit), A:8bit|
+| 资源            | Ascend 910；CPU 2.60GHz，192核；内存 755G；系统 Euler2.8  |Ascend 910；CPU 2.60GHz，192核；内存 755G；系统 Euler2.8 |
+| 上传日期         | 2020-06-06                             |2021-04-30                   |
+| MindSpore版本   | 0.3.0                                  |1.3.0                         |
+| 数据集          | ImageNet                               |ImageNet                      |
+| 训练参数         | src/config.py                          |src/config.py                 |
+| 优化器           | Momentum                              |Momentum                      |
+| 损失函数         | Softmax交叉熵                          |Softmax交叉熵                  |
+| 输出            | ckpt文件                               |ckpt文件                       |
+| 损失            | 1.913                                 |                              |
+| 准确率          |                                       |                              |
+| 总时长          | 16 h                                  |                              |
+| 参数(M)         | batch_size=192, epoch=60             |batch_size=192, epoch=40      |
+| 微调检查点       |                                      |                              |
+| 推理模型        |                                       |                               |

 #### 评估性能

-| 参数                 |                               |
-| -------------------------- | ----------------------------- |
-| 模型版本              | V2                            |
-| 资源                   | Ascend 910；系统 Euler2.8                     |
-| 上传日期              | 2020-06-06                    |
-| MindSpore版本          | 0.3.0                         |
-| 数据集                    | ImageNet, 1.2W                |
-| 批次大小                 | 130（8P）                       |
-| 输出                    | 概率                   |
-| 准确率                   | ACC1[71.78%] ACC5[90.90%]     |
-| 速度                      | 200毫秒/步                    |
-| 总时长                 | 5分钟                          |
-| 推理模型        |                               |
+| 参数                |                           |                               |
+| ------------------ | --------------------------|------------------------------ |
+| 模型版本            | V2                         |V2                            |
+| 量化方案            | 传统量化感知训练（默认）       |量化步长可学习的量化感知训练       |
+| 量化策略            | W:8bit, A:8bit             | W:4bit (首尾层为 8bit), A:8bit|
+| 资源               | Ascend 910；系统 Euler2.8   |  Ascend 910；系统 Euler2.8    |
+| 上传日期            | 2020-06-06                |2021-04-30                    |
+| MindSpore版本      | 0.3.0                     |  1.3.0                       |
+| 数据集             | ImageNet, 1.2W            |  ImageNet, 1.2W              |
+| 批次大小           | 130（8P）                  |                              |
+| 输出              | 概率                       |  概率                         |
+| 准确率            | ACC1[71.78%] ACC5[90.90%] |                              |
+| 速度              | 200毫秒/步                |                              |
+| 总时长            | 5分钟                     |                              |
+| 推理模型          |                          |                               |

 # 随机情况说明

--- a/model_zoo/official/cv/mobilenetv2_quant/Readme.md
+++ b/model_zoo/official/cv/mobilenetv2_quant/Readme.md
@ -49,13 +49,20 @@ Dataset used: [imagenet](http://www.image-net.org/)
 The [mixed precision](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/enable_mixed_precision.html) training method accelerates the deep learning neural network training process by using both the single-precision and half-precision data formats, and maintains the network precision achieved by the single-precision training at the same time. Mixed precision training can accelerate the computation process, reduce memory usage, and enable a larger model or batch size to be trained on specific hardware.
 For FP16 operators, if the input data type is FP32, the backend of MindSpore will automatically handle it with reduced precision. Users could check the reduced-precision operators by enabling INFO log and then searching ‘reduce precision’.

+## [Learned Step Size Quantization](#contents)
+
+Inspired by paper [Learned Step Size Quantization](https://arxiv.org/abs/1902.08153)
+, we proposed an optimize option, whose quantization scale is learned during the fine-tune process.
+This feature has good benefits for low bits quantization scenarios, which is referred to as LSQ.
+Users are free to choose whether to use the LEARNED_SCALE optimize option for quantization.
+
 # [Environment Requirements](#contents)

- Hardware:Ascend
-    - Prepare hardware environment with Ascend.
+- Hardware (Ascend/GPU)
+    - Prepare hardware environment with Ascend or GPU.
 - Framework
  - [MindSpore](https://www.mindspore.cn/install/en)
- For more information, please check the resources below
+- For more information, please check the resources below:
  - [MindSpore Tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html)
  - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)

@ -67,8 +74,10 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
 ├── mobileNetv2_quant
  ├── Readme.md     # descriptions about MobileNetV2-Quant
  ├── scripts
-  │   ├──run_train.sh   # shell script for train on Ascend
-  │   ├──run_infer.sh    # shell script for evaluation on Ascend
+  │   ├──run_train.sh    # shell script for train on Ascend or GPU
+  │   ├──run_infer.sh    # shell script for evaluation on Ascend or GPU
+  │   ├──run_lsq_train.sh    # shell script for train (using the LEARNED_SCALE optimize option) on Ascend or GPU
+  │   ├──run_lsq_infer.sh    # shell script for evaluation (using the LEARNED_SCALE optimize option) on Ascend or GPU
  ├── src
  │   ├──config.py      # parameter configuration
  │   ├──dataset.py     # creating dataset
@ -85,7 +94,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil

 Parameters for both training and evaluation can be set in config.py

- config for MobileNetV2-quant, ImageNet2012 dataset
+- config for MobileNetV2-quant, ImageNet2012 dataset（We take the environment configuration of ascend as an example here, and you will get more detail in src/config.py）

  ```python
  'num_classes': 1000       # the number of classes in the dataset
@ -111,15 +120,44 @@ Parameters for both training and evaluation can be set in config.py

 You can start training using python or shell scripts. The usage of shell scripts as follows:

+For quantization aware training (default):
+
 - bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)
 - bash run_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)

+For Learned Step Size Quantization:
+
+- bash run_lsq_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]
+- bash run_lsq_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]
+
+`PRETRAINED_CKPT_PATH`  is optional. If it is given, quantization is based on the specified pre training ckpt file. We recommend users to execute quantization based on the pre training ckpt file.
+
+`RANK_TABLE_FILE` is HCCL configuration file when running on Ascend.
+> The common restrictions on using the distributed service are as follows. For details, see the HCCL documentation.
+>
+> - In a single-node system, a cluster of 1, 2, 4, or 8 devices is supported. In a multi-node system, a cluster of 8 x N devices is supported.
+> - Each host has four devices numbered 0 to 3 and four devices numbered 4 to 7 deployed on two different networks. During training of 2 or 4 devices, the devices must be connected and clusters cannot be created across networks.
+
 ### Launch

 ``` bash
-  # training example
-  >>> bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt
-  >>> bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt
+  # training example for quantization aware training (default)
+  python：
+          Ascend:  python train.py --device_target Ascend --dataset_path ~/imagenet/train/
+          GPU:  python train.py --device_target GPU --dataset_path ~/imagenet/train/
+  shell：
+          Ascend: bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt
+          GPU: bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt
+
+  # training example for Learned Step Size Quantization
+  python：
+          Ascend:  python train.py --device_target Ascend --dataset_path ~/imagenet/train/ \
+                   --pre_trained ~/mobilenet.ckpt --optim_option "LEARNED_SCALE"
+          GPU:  python train.py --device_target GPU --dataset_path ~/imagenet/train/ \
+                --pre_trained ~/mobilenet.ckpt --optim_option "LEARNED_SCALE"
+  shell：
+          Ascend: bash run_lsq_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt
+          GPU: bash run_lsq_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt
 ```

 ### Result
@ -138,16 +176,40 @@ epoch time: 138331.250, per step time: 221.330, avg loss: 3.917

 ### Usage

-You can start training using python or shell scripts. The usage of shell scripts as follows:
+You can start evaluating using python or shell scripts. The usage of shell scripts as follows:

- Ascend: sh run_infer_quant.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH]
+For quantization aware training (default):
+
+- Ascend: sh run_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH]
+- GPU: sh run_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH]
+
+For Learned Step Size Quantization:
+
+- Ascend: sh run_lsq_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH]
+- GPU: sh run_lsq_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH]

 ### Launch

-``` bash
-# infer example
-  shell:
-      Ascend: sh run_infer_quant.sh Ascend ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
+```bash
+# training example for quantization aware training (default)
+python：
+       Ascend:  python eval.py --device_target Ascend --dataset_path [VAL_DATASET_PATH] --checkpoint_path ~/train/mobilenet-60_1601.ckpt
+       GPU:  python eval.py --device_target GPU --dataset_path [VAL_DATASET_PATH] --checkpoint_path ~/train/mobilenet-60_1601.ckpt
+
+shell:
+      Ascend: sh run_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
+      GPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
+
+# training example for Learned Step Size Quantization
+python：
+       Ascend:  python eval.py --device_target Ascend --dataset_path ~/imagenet/val/ \
+                --checkpoint_path ~/train/mobilenet-60_1601.ckpt --optim_option "LEARNED_SCALE"
+       GPU:  python eval.py --device_target GPU --dataset_path ~/imagenet/val/ \
+             --checkpoint_path ~/train/mobilenet-60_1601.ckpt --optim_option "LEARNED_SCALE"
+
+shell:
+      Ascend: sh run_lsq_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
+      GPU: sh run_lsq_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-60_1601.ckpt
 ```

 > checkpoint can be produced in training process.
@ -160,45 +222,58 @@ Inference result will be stored in the example path, you can find result like th
 result: {'acc': 0.71976314102564111}
 ```

+## [Model Export](#contents)
+
+```shell
+python export.py --checkpoint_path [CKPT_PATH] --file_format [EXPORT_FORMAT] --device_target [PLATFORM] --optim_option [OptimizeOption]
+```
+
+`EXPORT_FORMAT` should be in ["AIR", "MINDIR"].
+`OptimizeOption` should be in ["QAT", "LEARNED_SCALE"].
+
 # [Model description](#contents)

 ## [Performance](#contents)

 ### Training Performance

-| Parameters                 | MobilenetV2                                                |
-| -------------------------- | ---------------------------------------------------------- |
-| Model Version              | V2                                                         |
-| Resource                   | Ascend 910; cpu 2.60GHz, 192cores; memory 755G; OS Euler2.8               |
-| uploaded Date              | 06/06/2020                                                 |
-| MindSpore Version          | 0.3.0                                                      |
-| Dataset                    | ImageNet                                                   |
-| Training Parameters        | src/config.py                                              |
-| Optimizer                  | Momentum                                                   |
-| Loss Function              | SoftmaxCrossEntropy                                        |
-| outputs                    | ckpt file                                                  |
-| Loss                       | 1.913                                                      |
-| Accuracy                   |                                                            |
-| Total time                 | 16h                                                        |
-| Params (M)                 | batch_size=192, epoch=60                                   |
-| Checkpoint for Fine tuning |                                                            |
-| Model for inference        |                                                            |
+| Parameters                 | MobilenetV2                                       | MobilenetV2                                       |
+| -------------------------- | --------------------------------------------------| --------------------------------------------------|
+| Model Version              | V2                                                | V2                                                |
+| Optimize Option            | QAT                                               | LEARNED_SCALE                                     |
+| Quantization Strategy      | W:8bit, A:8bit                                    | W:4bit (The first and last layers are 8bit), A:8bit|
+| Resource                   | Ascend 910; cpu 2.60GHz, 192cores; memory 755G; OS Euler2.8     | Ascend 910; cpu 2.60GHz, 192cores; memory 755G; OS Euler2.8     |
+| uploaded Date              | 06/06/2020                                        | 04/30/2021                                        |
+| MindSpore Version          | 0.3.0                                             | 1.3.0                                             |
+| Dataset                    | ImageNet                                          | ImageNet                                          |
+| Training Parameters        | src/config.py                                     | src/config.py                                     |
+| Optimizer                  | Momentum                                          | Momentum                                          |
+| Loss Function              | SoftmaxCrossEntropy                               | SoftmaxCrossEntropy                               |
+| outputs                    | ckpt file                                         | ckpt file                                         |
+| Loss                       | 1.913                                             |                                                   |
+| Accuracy                   |                                                   |                                                   |
+| Total time                 | 16h                                               |                                                   |
+| Params (M)                 | batch_size=192, epoch=60                          | batch_size=192, epoch=40                          |
+| Checkpoint for Fine tuning |                                                   |                                                   |
+| Model for inference        |                                                   |                                                   |

 #### Evaluation Performance

-| Parameters                 |                               |
-| -------------------------- | ----------------------------- |
-| Model Version              | V2                            |
-| Resource                   | Ascend 910; OS Euler2.8                     |
-| uploaded Date              | 06/06/2020                    |
-| MindSpore Version          | 0.3.0                         |
-| Dataset                    | ImageNet, 1.2W                |
-| batch_size                 | 130(8P)                       |
-| outputs                    | probability                   |
-| Accuracy                   | ACC1[71.78%] ACC5[90.90%]     |
-| Speed                      | 200ms/step                    |
-| Total time                 | 5min                          |
-| Model for inference        |                               |
+| Parameters                 |                               |                               |
+| -------------------------- | ----------------------------- | ----------------------------- |
+| Model Version              | V2                            | V2                            |
+| Optimize Option            | QAT                           | LEARNED_SCALE                 |
+| Quantization Strategy      | W:8bit, A:8bit                | W:4bit (The first and last layers are 8bit), A:8bit|
+| Resource                   | Ascend 910; OS Euler2.8       | Ascend 910; OS Euler2.8       |
+| uploaded Date              | 06/06/2020                    | 04/30/2021                    |
+| MindSpore Version          | 0.3.0                         | 1.3.0                         |
+| Dataset                    | ImageNet, 1.2W                | ImageNet, 1.2W                |
+| batch_size                 | 130(8P)                       |                               |
+| outputs                    | probability                   | probability                   |
+| Accuracy                   | ACC1[71.78%] ACC5[90.90%]     |                               |
+| Speed                      | 200ms/step                    |                               |
+| Total time                 | 5min                          |                               |
+| Model for inference        |                               |                               |

 # [Description of Random Situation](#contents)

--- a/model_zoo/official/cv/mobilenetv2_quant/eval.py
+++ b/model_zoo/official/cv/mobilenetv2_quant/eval.py
@ -21,42 +21,67 @@ from mindspore import context
 from mindspore import nn
 from mindspore.train.model import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore.compression.common import QuantDtype
 from mindspore.compression.quant import QuantizationAwareTraining
-
+from mindspore.compression.quant.quantizer import OptimizeOption
 from src.mobilenetV2 import mobilenetV2
+from src.mobilenetv2_mix_quant import mobilenetv2_mix_quant
 from src.dataset import create_dataset
-from src.config import config_ascend_quant
-from src.config import config_gpu_quant
+from src.config import config_ascend_quant, config_gpu_quant, config_lsq_ascend_quant, config_lsq_gpu_quant

 parser = argparse.ArgumentParser(description='Image classification')
 parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
 parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
 parser.add_argument('--device_target', type=str, default=None, help='Run device target')
+parser.add_argument('--optim_option', type=str, default="QAT", help='OptimizeOption')
 args_opt = parser.parse_args()

 if __name__ == '__main__':
    config_device_target = None
    device_id = int(os.getenv('DEVICE_ID'))
    if args_opt.device_target == "Ascend":
-        config_device_target = config_ascend_quant
+        if args_opt.optim_option == "LEARNED_SCALE":
+            config_device_target = config_lsq_ascend_quant
+        else:
+            config_device_target = config_ascend_quant
+            symmetric_list = [True, False]
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                            device_id=device_id, save_graphs=False)
-        symmetric_list = [True, False]
    elif args_opt.device_target == "GPU":
-        config_device_target = config_gpu_quant
+        if args_opt.optim_option == "LEARNED_SCALE":
+            config_device_target = config_lsq_gpu_quant
+        else:
+            config_device_target = config_gpu_quant
+            symmetric_list = [False, False]
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU",
                            device_id=device_id, save_graphs=False)
-        symmetric_list = [False, False]
    else:
        raise ValueError("Unsupported device target: {}.".format(args_opt.device_target))

-    # define fusion network
-    network = mobilenetV2(num_classes=config_device_target.num_classes)
-    # convert fusion network to quantization aware network
-    quantizer = QuantizationAwareTraining(bn_fold=True,
-                                          per_channel=[True, False],
-                                          symmetric=symmetric_list)
+
+    if args_opt.optim_option == "LEARNED_SCALE":
+        # define fusion network
+        network = mobilenetv2_mix_quant(num_classes=config_device_target.num_classes)
+        # convert fusion network to quantization aware network
+        quant_optim_otions = OptimizeOption.LEARNED_SCALE
+        quantizer = QuantizationAwareTraining(bn_fold=True,
+                                              per_channel=[True, False],
+                                              symmetric=[True, True],
+                                              narrow_range=[True, True],
+                                              quant_dtype=(QuantDtype.INT4, QuantDtype.INT8),
+                                              freeze_bn=0,
+                                              quant_delay=0,
+                                              one_conv_fold=True,
+                                              optimize_option=quant_optim_otions)
+    else:
+        # define fusion network
+        network = mobilenetV2(num_classes=config_device_target.num_classes)
+        # convert fusion network to quantization aware network
+        quantizer = QuantizationAwareTraining(bn_fold=True,
+                                              per_channel=[True, False],
+                                              symmetric=symmetric_list)
    network = quantizer.quantize(network)
+
    # define network loss
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

--- a/model_zoo/official/cv/mobilenetv2_quant/export.py
+++ b/model_zoo/official/cv/mobilenetv2_quant/export.py
@ -19,26 +19,45 @@ import numpy as np

 import mindspore
 from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export
+from mindspore.compression.common import QuantDtype
 from mindspore.compression.quant import QuantizationAwareTraining
-
+from mindspore.compression.quant.quantizer import OptimizeOption
 from src.mobilenetV2 import mobilenetV2
+from src.mobilenetv2_mix_quant import mobilenetv2_mix_quant
 from src.config import config_quant

 parser = argparse.ArgumentParser(description='Image classification')
 parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
 parser.add_argument("--file_format", type=str, choices=["AIR", "MINDIR"], default="MINDIR", help="file format")
 parser.add_argument('--device_target', type=str, default=None, help='Run device target')
+parser.add_argument('--optim_option', type=str, default="QAT", help='OptimizeOption')
 args_opt = parser.parse_args()

 if __name__ == '__main__':
    cfg = config_quant(args_opt.device_target)
    context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, save_graphs=False)
-    # define fusion network
-    network = mobilenetV2(num_classes=cfg.num_classes)
-    # convert fusion network to quantization aware network
-    quantizer = QuantizationAwareTraining(bn_fold=True,
-                                          per_channel=[True, False],
-                                          symmetric=[True, False])
+
+    if args_opt.optim_option == "LEARNED_SCALE":
+        # define fusion network
+        network = mobilenetv2_mix_quant(num_classes=cfg.num_classes)
+        # convert fusion network to quantization aware network
+        quant_optim_otions = OptimizeOption.LEARNED_SCALE
+        quantizer = QuantizationAwareTraining(bn_fold=True,
+                                              per_channel=[True, False],
+                                              symmetric=[True, True],
+                                              narrow_range=[True, True],
+                                              quant_dtype=(QuantDtype.INT4, QuantDtype.INT8),
+                                              freeze_bn=0,
+                                              quant_delay=0,
+                                              one_conv_fold=True,
+                                              optimize_option=quant_optim_otions)
+    else:
+        # define fusion network
+        network = mobilenetV2(num_classes=cfg.num_classes)
+        # convert fusion network to quantization aware network
+        quantizer = QuantizationAwareTraining(bn_fold=True,
+                                              per_channel=[True, False],
+                                              symmetric=[True, False])
    network = quantizer.quantize(network)
    # load checkpoint
    param_dict = load_checkpoint(args_opt.checkpoint_path)
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_lsq_infer.sh
+++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_lsq_infer.sh
@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+if [ $# != 3 ]
+then
+    echo "Ascend: sh run_lsq_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]"
+exit 1
+fi
+
+# check dataset path
+if [ ! -d $2 ] && [ ! -f $2 ]
+then
+    echo "error: DATASET_PATH=$2 is not a directory or file"
+exit 1
+fi
+
+# check checkpoint file
+if [ ! -f $3 ]
+then
+    echo "error: CHECKPOINT_PATH=$3 is not a file"
+exit 1
+fi
+
+# set environment
+BASEPATH=$(cd "`dirname $0`" || exit; pwd)
+export DEVICE_ID=0
+export RANK_ID=0
+export RANK_SIZE=1
+if [ -d "../eval" ];
+then
+    rm -rf ../eval
+fi
+mkdir ../eval
+cd ../eval || exit
+
+# launch
+python ${BASEPATH}/../eval.py \
+        --device_target=$1 \
+        --dataset_path=$2 \
+        --checkpoint_path=$3 \
+        --optim_option="LEARNED_SCALE" \
+        &> infer.log &  # dataset val folder path
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_lsq_train.sh
+++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_lsq_train.sh
@ -0,0 +1,177 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+
+
+get_gpu_device_num(){
+
+    #device_list=(${1//,/ })
+    IFS=',' read -ra device_list <<<"$1"
+    device_used=(0 0 0 0 0 0 0 0)
+    device_num=0
+    for var in "${device_list[@]}"
+    do  
+        if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ]
+        then 
+            echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!"
+            exit 1
+        fi
+
+        if [  ${device_used[$((var))]} -eq 0 ]
+        then 
+            device_used[ $((var)) ]=1
+            device_num=$((device_num+1))
+        fi
+    done
+
+    echo ${device_num}
+}
+
+
+run_ascend(){
+
+    if [ $# -gt 4 ] || [ $# -lt 4 ]
+    then
+        echo "Usage:  bash run_lsq_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\n "
+        exit 1
+    fi
+    PATH1=$(get_real_path $2)
+    PATH2=$(get_real_path $3)
+    PATH3=$(get_real_path $4)
+
+    if [ ! -f $PATH1 ]
+    then 
+        echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+        exit 1
+    fi 
+
+    if [ ! -d $PATH2 ]
+    then 
+        echo "error: DATASET_PATH=$PATH2 is not a directory"
+    exit 1
+    fi 
+
+    if [ $# == 4 ] && [ ! -f $PATH3 ]
+    then
+        echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
+        exit 1
+    fi
+
+
+
+    #rank_file_name=${2##*/}
+    #IFS='_' read -ra array <<<"${rank_file_name}"
+    #device_id_list=${array[2]}
+    #first_device=${device_id_list:0:1}
+    #last_device=${device_list:${#device_list}-1:1}
+    #device_num=${#device_id_list}
+    cat $2 | awk -F "[device_id]" '/device_id/{print$0}' >temp.log
+    array=$(cat temp.log | awk -F "[:]" '/device_id/{print$2}')
+    
+    IFS=" " read -ra device_list <<<$array
+    first_device=${device_list[0]:1:1}
+    #device_num=${#device_list[*]}
+    device_num=$(cat temp.log | wc -l)
+    rm temp.log
+    ulimit -u unlimited
+    export DEVICE_NUM=${device_num}
+    export RANK_SIZE=${device_num}
+    export RANK_TABLE_FILE=$PATH1
+
+    export SERVER_ID=0
+    rank_start=$((DEVICE_NUM * SERVER_ID))
+
+    rm -rf ./train
+    mkdir ./train
+    for((i=0; i<${device_num}; i++))
+    do
+        export DEVICE_ID=$((first_device+i))
+        export RANK_ID=$((rank_start + i))
+        mkdir ./train/device$i
+        cp ../*.py ./train/device$i
+        cp *.sh ./train/device$i
+        cp -r ../src ./train/device$i
+        cd ./train/device$i || exit
+        echo "start training for rank $RANK_ID, device $DEVICE_ID"
+        env > env.log
+        python train.py --device_target=$1  --dataset_path=$PATH2 --pre_trained=$PATH3 \
+        --optim_option="LEARNED_SCALE" &> train.log &
+
+        cd ../.. || exit
+    done
+}
+
+run_gpu(){
+    if [ $# -gt 4 ] || [ $# -lt 4 ]
+    then
+        echo "Usage:  bash run_lsq_train.sh  [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\n "
+        exit 1
+    fi
+
+    PATH1=$(get_real_path $3)
+    PATH2=$(get_real_path $4)
+
+    if [ ! -d $PATH1 ]
+    then 
+        echo "error: DATASET_PATH=$PATH1 is not a directory"
+        exit 1
+    fi 
+
+    if [ $# == 4 ] && [ ! -f $PATH2 ]
+    then
+        echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+        exit 1
+    fi
+
+    device_num=$(get_gpu_device_num $2)
+
+    ulimit -u unlimited
+    export DEVICE_NUM=${device_num}
+    export RANK_SIZE=${device_num}
+    export CUDA_VISIBLE_DEVICES=$2
+    
+    rm -rf ./train
+    mkdir ./train
+    cp ../*.py ./train
+    cp *.sh ./train
+    cp -r ../src ./train
+    cd ./train || exit
+    echo "start training"
+    env > env.log
+    
+    mpirun --allow-run-as-root -n ${RANK_SIZE} --output-filename log_output --merge-stderr-to-stdout \
+    python train.py --device_target=$1  --dataset_path=$PATH1 --pre_trained=$PATH2 \
+    --optim_option="LEARNED_SCALE" &> train.log &
+
+    cd ..
+}
+
+
+if [ $1 = "Ascend" ] ; then
+    run_ascend "$@"
+elif [ $1 = "GPU" ] ; then
+    run_gpu "$@"
+else
+    echo "Unsupported device target: $1"
+fi;
+
--- a/model_zoo/official/cv/mobilenetv2_quant/src/config.py
+++ b/model_zoo/official/cv/mobilenetv2_quant/src/config.py
@ -55,6 +55,45 @@ config_gpu_quant = ed({
    "save_checkpoint_path": "./checkpoint",
 })

+config_lsq_gpu_quant = ed({
+    "num_classes": 1000,
+    "image_height": 224,
+    "image_width": 224,
+    "batch_size": 128,
+    "epoch_size": 60,
+    "start_epoch": 200,
+    "warmup_epochs": 0,
+    "lr": 0.05,
+    "momentum": 0.9,
+    "weight_decay": 4e-5,
+    "label_smooth": 0.1,
+    "loss_scale": 1024,
+    "save_checkpoint": True,
+    "save_checkpoint_epochs": 1,
+    "keep_checkpoint_max": 300,
+    "save_checkpoint_path": "./checkpoint",
+})
+
+config_lsq_ascend_quant = ed({
+    "num_classes": 1000,
+    "image_height": 224,
+    "image_width": 224,
+    "batch_size": 192,
+    "data_load_mode": "mindata",
+    "epoch_size": 40,
+    "start_epoch": 200,
+    "warmup_epochs": 0,
+    "lr": 0.05,
+    "momentum": 0.9,
+    "weight_decay": 4e-5,
+    "label_smooth": 0.1,
+    "loss_scale": 1024,
+    "save_checkpoint": True,
+    "save_checkpoint_epochs": 1,
+    "keep_checkpoint_max": 300,
+    "save_checkpoint_path": "./checkpoint",
+})
+
 def config_quant(device_target):
    if device_target not in ["Ascend", "GPU"]:
        raise ValueError("Unsupported device target: {}.".format(device_target))
--- a/model_zoo/official/cv/mobilenetv2_quant/src/mobilenetv2_mix_quant.py
+++ b/model_zoo/official/cv/mobilenetv2_quant/src/mobilenetv2_mix_quant.py
@ -0,0 +1,414 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""MobileNetV2 model define"""
+import numpy as np
+import mindspore.nn as nn
+from mindspore.ops import operations as P
+from mindspore.ops.operations import Add
+from mindspore import Tensor
+from mindspore.compression.common import QuantDtype
+from mindspore.compression.quant import create_quant_config
+
+__all__ = ['mobilenetv2_mix_quant']
+
+
+def _make_divisible(v, divisor, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class GlobalAvgPooling(nn.Cell):
+    """
+    Global avg pooling definition.
+
+    Args:
+
+    Returns:
+        Tensor, output tensor.
+
+    Examples:
+        >>> GlobalAvgPooling()
+    """
+
+    def __init__(self):
+        super(GlobalAvgPooling, self).__init__()
+        self.mean = P.ReduceMean(keep_dims=False)
+
+    def construct(self, x):
+        x = self.mean(x, (2, 3))
+        return x
+
+
+class ConvBNReLU(nn.Cell):
+    """
+    Convolution/Depthwise fused with Batchnorm and ReLU block definition.
+
+    Args:
+        in_planes (int): Input channel.
+        out_planes (int): Output channel.
+        kernel_size (int): Input kernel size.
+        stride (int): Stride size for the first convolutional layer. Default: 1.
+        groups (int): channel group. Convolution is 1 while Depthiwse is input channel. Default: 1.
+
+    Returns:
+        Tensor, output tensor.
+
+    Examples:
+        >>> ConvBNReLU(16, 256, kernel_size=1, stride=1, groups=1)
+    """
+
+    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+        super(ConvBNReLU, self).__init__()
+        padding = (kernel_size - 1) // 2
+        self.conv = nn.Conv2dBnAct(in_planes, out_planes, kernel_size,
+                                   stride=stride,
+                                   pad_mode='pad',
+                                   padding=padding,
+                                   group=groups,
+                                   has_bn=True,
+                                   activation='relu6')
+
+    def construct(self, x):
+        output = self.conv(x)
+        return output
+
+quant_config = create_quant_config(per_channel=(True, False), symmetric=(True, True), narrow_range=(True, True),
+                                   mode="LEARNED_SCALE")
+class FirstQuantLayer(nn.Cell):
+    """
+    The first quantization layer, which is fixed to 8bit.
+    """
+
+    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1):
+        super(FirstQuantLayer, self).__init__()
+        padding = (kernel_size - 1) // 2
+        in_channels = in_planes
+        out_channels = out_planes
+        conv_inner = nn.Conv2dBnFoldQuantOneConv(in_channels,
+                                                 out_channels,
+                                                 kernel_size=kernel_size,
+                                                 stride=stride,
+                                                 pad_mode='pad',
+                                                 padding=padding,
+                                                 quant_config=quant_config,
+                                                 quant_dtype=QuantDtype.INT8)
+        activation = nn.ActQuant(activation=nn.ReLU6(),
+                                 quant_config=quant_config,
+                                 quant_dtype=QuantDtype.INT8)
+        self.features = nn.SequentialCell([conv_inner, activation])
+
+    def construct(self, x):
+        output = self.features(x)
+        return output
+
+class LastQuantLayer(nn.Cell):
+    """
+    The last quantization layer, which is fixed to 8bit.
+    """
+
+    def __init__(self, in_channels, out_channels, has_bias, has_bn):
+        super(LastQuantLayer, self).__init__()
+
+        self.dense_inner = nn.DenseQuant(in_channels,
+                                         out_channels,
+                                         has_bias=has_bias,
+                                         quant_config=quant_config,
+                                         quant_dtype=QuantDtype.INT8)
+        self.fake_quant_act = nn.FakeQuantWithMinMaxObserver(min_init=-16,
+                                                             max_init=16,
+                                                             ema=True,
+                                                             quant_dtype=QuantDtype.INT8,
+                                                             per_channel=False,
+                                                             symmetric=True,
+                                                             narrow_range=True,
+                                                             mode="LEARNED_SCALE")
+
+    def construct(self, x):
+        output = self.dense_inner(x)
+        output = self.fake_quant_act(output)
+        return output
+
+class InvertedResidual(nn.Cell):
+    """
+    Mobilenetv2 residual block definition.
+
+    Args:
+        inp (int): Input channel.
+        oup (int): Output channel.
+        stride (int): Stride size for the first convolutional layer. Default: 1.
+        expand_ratio (int): expand ration of input channel
+
+    Returns:
+        Tensor, output tensor.
+
+    Examples:
+        >>> ResidualBlock(3, 256, 1, 1)
+    """
+
+    def __init__(self, inp, oup, stride, expand_ratio):
+        super(InvertedResidual, self).__init__()
+        assert stride in [1, 2]
+
+        hidden_dim = int(round(inp * expand_ratio))
+        self.use_res_connect = stride == 1 and inp == oup
+
+        layers = []
+        if expand_ratio != 1:
+            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
+        layers.extend([
+            # dw
+            ConvBNReLU(hidden_dim, hidden_dim,
+                       stride=stride, groups=hidden_dim),
+            # pw-linear
+            nn.Conv2dBnAct(hidden_dim, oup, kernel_size=1, stride=1, has_bn=True)
+        ])
+        self.conv = nn.SequentialCell(layers)
+        self.add = Add()
+        self.cast = P.Cast()
+
+    def construct(self, x):
+        identity = x
+        x = self.conv(x)
+        if self.use_res_connect:
+            return self.add(identity, x)
+        return x
+
+
+class MobileNetV2Backbone(nn.Cell):
+    """
+    MobileNetV2 architecture.
+
+    Args:
+        class_num (int): number of classes.
+        width_mult (int): Channels multiplier for round to 8/16 and others. Default is 1.
+        has_dropout (bool): Is dropout used. Default is false
+        inverted_residual_setting (list): Inverted residual settings. Default is None
+        round_nearest (list): Channel round to . Default is 8
+    Returns:
+        Tensor, output tensor.
+
+    Examples:
+        >>> MobileNetV2(num_classes=1000)
+    """
+
+    def __init__(self, width_mult=1., inverted_residual_setting=None, round_nearest=8,
+                 input_channel=32, last_channel=1280):
+        super(MobileNetV2Backbone, self).__init__()
+        block = InvertedResidual
+        # setting of inverted residual blocks
+        self.cfgs = inverted_residual_setting
+        if inverted_residual_setting is None:
+            self.cfgs = [
+                # t, c, n, s
+                [1, 16, 1, 1],
+                [6, 24, 2, 2],
+                [6, 32, 3, 2],
+                [6, 64, 4, 2],
+                [6, 96, 3, 1],
+                [6, 160, 3, 2],
+                [6, 320, 1, 1],
+            ]
+
+        # building first layer
+        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+        self.out_channels = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+        features = [FirstQuantLayer(3, input_channel, stride=2)]
+        # building inverted residual blocks
+        for t, c, n, s in self.cfgs:
+            output_channel = _make_divisible(c * width_mult, round_nearest)
+            for i in range(n):
+                stride = s if i == 0 else 1
+                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
+                input_channel = output_channel
+        # building last several layers
+        features.append(ConvBNReLU(input_channel, self.out_channels, kernel_size=1))
+        # make it nn.CellList
+        self.features = nn.SequentialCell(features)
+        self._initialize_weights()
+
+    def construct(self, x):
+        x = self.features(x)
+        return x
+
+    def _initialize_weights(self):
+        """
+        Initialize weights.
+
+        Args:
+
+        Returns:
+            None.
+
+        Examples:
+            >>> _initialize_weights()
+        """
+        self.init_parameters_data()
+        for _, m in self.cells_and_names():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.set_data(Tensor(np.random.normal(0, np.sqrt(2. / n),
+                                                          m.weight.data.shape).astype("float32")))
+                if m.bias is not None:
+                    m.bias.set_data(
+                        Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.gamma.set_data(
+                    Tensor(np.ones(m.gamma.data.shape, dtype="float32")))
+                m.beta.set_data(
+                    Tensor(np.zeros(m.beta.data.shape, dtype="float32")))
+
+    @property
+    def get_features(self):
+        return self.features
+
+
+class MobileNetV2Head(nn.Cell):
+    """
+    MobileNetV2 architecture.
+
+    Args:
+        class_num (int): Number of classes. Default is 1000.
+        has_dropout (bool): Is dropout used. Default is false
+    Returns:
+        Tensor, output tensor.
+
+    Examples:
+        >>> MobileNetV2(num_classes=1000)
+    """
+
+    def __init__(self, input_channel=1280, num_classes=1000, has_dropout=False, activation="None"):
+        super(MobileNetV2Head, self).__init__()
+        # mobilenet head
+        head = ([GlobalAvgPooling(), LastQuantLayer(input_channel, num_classes, has_bias=True, has_bn=False)]
+                if not has_dropout else
+                [GlobalAvgPooling(), nn.Dropout(0.2), LastQuantLayer(input_channel, num_classes,
+                                                                     has_bias=True, has_bn=False)])
+        self.head = nn.SequentialCell(head)
+        self.need_activation = True
+        if activation == "Sigmoid":
+            self.activation = P.Sigmoid()
+        elif activation == "Softmax":
+            self.activation = P.Softmax()
+        else:
+            self.need_activation = False
+        self._initialize_weights()
+
+    def construct(self, x):
+        x = self.head(x)
+        if self.need_activation:
+            x = self.activation(x)
+        return x
+
+    def _initialize_weights(self):
+        """
+        Initialize weights.
+
+        Args:
+
+        Returns:
+            None.
+
+        Examples:
+            >>> _initialize_weights()
+        """
+        self.init_parameters_data()
+        for _, m in self.cells_and_names():
+            if isinstance(m, nn.Dense):
+                m.weight.set_data(Tensor(np.random.normal(
+                    0, 0.01, m.weight.data.shape).astype("float32")))
+                if m.bias is not None:
+                    m.bias.set_data(
+                        Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
+    @property
+    def get_head(self):
+        return self.head
+
+
+class MobileNetV2(nn.Cell):
+    """
+    MobileNetV2 architecture.
+
+    Args:
+        class_num (int): number of classes.
+        width_mult (int): Channels multiplier for round to 8/16 and others. Default is 1.
+        has_dropout (bool): Is dropout used. Default is false
+        inverted_residual_setting (list): Inverted residual settings. Default is None
+        round_nearest (list): Channel round to . Default is 8
+    Returns:
+        Tensor, output tensor.
+
+    Examples:
+        >>> MobileNetV2(backbone, head)
+    """
+
+    def __init__(self, num_classes=1000, width_mult=1., has_dropout=False, inverted_residual_setting=None, \
+        round_nearest=8, input_channel=32, last_channel=1280):
+        super(MobileNetV2, self).__init__()
+        self.backbone = MobileNetV2Backbone(width_mult=width_mult, \
+            inverted_residual_setting=inverted_residual_setting, \
+            round_nearest=round_nearest, input_channel=input_channel, last_channel=last_channel).get_features
+        self.head = MobileNetV2Head(input_channel=self.backbone.out_channel, num_classes=num_classes, \
+            has_dropout=has_dropout).get_head
+
+    def construct(self, x):
+        x = self.backbone(x)
+        x = self.head(x)
+        return x
+
+
+class MobileNetV2Combine(nn.Cell):
+    """
+    MobileNetV2Combine architecture.
+
+    Args:
+        backbone (Cell): the features extract layers.
+        head (Cell):  the fully connected layers.
+    Returns:
+        Tensor, output tensor.
+
+    Examples:
+        >>> MobileNetV2Combine(backbone, head)
+    """
+
+    def __init__(self, backbone, head):
+        super(MobileNetV2Combine, self).__init__(auto_prefix=False)
+        self.backbone = backbone
+        self.head = head
+
+    def construct(self, x):
+        x = self.backbone(x)
+        x = self.head(x)
+        return x
+
+
+def mobilenet_v2(backbone, head):
+    return MobileNetV2Combine(backbone, head)
+
+
+def mobilenetv2_mix_quant(num_classes):
+    """
+    MobileNetV2 quantization model, the first and last layers are fixed to 8bit by manual quantization cell,
+    and others `QuantDtype` are seted by the `QuantizationAwareTraining` API.
+    """
+    backbone_net = MobileNetV2Backbone()
+    head_net = MobileNetV2Head(input_channel=backbone_net.out_channels,
+                               num_classes=num_classes)
+    net = mobilenet_v2(backbone_net, head_net)
+    return net
--- a/model_zoo/official/cv/mobilenetv2_quant/train.py
+++ b/model_zoo/official/cv/mobilenetv2_quant/train.py
@ -26,15 +26,18 @@ from mindspore.train.loss_scale_manager import FixedLossScaleManager
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
 from mindspore.train.serialization import load_checkpoint
 from mindspore.communication.management import init, get_group_size, get_rank
+from mindspore.compression.common import QuantDtype
 from mindspore.compression.quant import QuantizationAwareTraining
+from mindspore.compression.quant.quantizer import OptimizeOption
 from mindspore.compression.quant.quant_utils import load_nonquant_param_into_quant_net
 from mindspore.common import set_seed

 from src.dataset import create_dataset
 from src.lr_generator import get_lr
 from src.utils import Monitor, CrossEntropyWithLabelSmooth
-from src.config import config_ascend_quant, config_gpu_quant
+from src.config import config_ascend_quant, config_gpu_quant, config_lsq_ascend_quant, config_lsq_gpu_quant
 from src.mobilenetV2 import mobilenetV2
+from src.mobilenetv2_mix_quant import mobilenetv2_mix_quant

 set_seed(1)

@ -42,6 +45,8 @@ parser = argparse.ArgumentParser(description='Image classification')
 parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
 parser.add_argument('--pre_trained', type=str, default=None, help='Pertained checkpoint path')
 parser.add_argument('--device_target', type=str, default=None, help='Run device target')
+parser.add_argument('--optim_option', type=str, default="QAT", help='If OptimizeOption is set to LEARNED_SCALE,'
+                                                                    'the learned scale quant process is executed.')
 args_opt = parser.parse_args()

 if args_opt.device_target == "Ascend":
@ -66,7 +71,11 @@ else:


 def train_on_ascend():
-    config = config_ascend_quant
+    if args_opt.optim_option == "LEARNED_SCALE":
+        config = config_lsq_ascend_quant
+    else:
+        config = config_ascend_quant
+
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))
    print("parallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))
@ -80,7 +89,10 @@ def train_on_ascend():
        init()

    # define network
-    network = mobilenetV2(num_classes=config.num_classes)
+    if args_opt.optim_option == "LEARNED_SCALE":
+        network = mobilenetv2_mix_quant(num_classes=config.num_classes)
+    else:
+        network = mobilenetV2(num_classes=config.num_classes)
    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth, num_classes=config.num_classes)
@ -99,10 +111,22 @@ def train_on_ascend():
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_nonquant_param_into_quant_net(network, param_dict)
    # convert fusion network to quantization aware network
-    quantizer = QuantizationAwareTraining(bn_fold=True,
-                                          per_channel=[True, False],
-                                          symmetric=[True, False],
-                                          one_conv_fold=True)
+    if args_opt.optim_option == "LEARNED_SCALE":
+        quant_optim_otions = OptimizeOption.LEARNED_SCALE
+        quantizer = QuantizationAwareTraining(bn_fold=True,
+                                              per_channel=[True, False],
+                                              symmetric=[True, True],
+                                              narrow_range=[True, True],
+                                              quant_dtype=(QuantDtype.INT4, QuantDtype.INT8),
+                                              freeze_bn=0,
+                                              quant_delay=0,
+                                              one_conv_fold=True,
+                                              optimize_option=quant_optim_otions)
+    else:
+        quantizer = QuantizationAwareTraining(bn_fold=True,
+                                              per_channel=[True, False],
+                                              symmetric=[True, False],
+                                              one_conv_fold=True)
    network = quantizer.quantize(network)

    # get learning rate
@ -136,12 +160,19 @@ def train_on_ascend():


 def train_on_gpu():
-    config = config_gpu_quant
+    if args_opt.optim_option == "LEARNED_SCALE":
+        config = config_lsq_gpu_quant
+    else:
+        config = config_gpu_quant
+
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))

    # define network
-    network = mobilenetV2(num_classes=config.num_classes)
+    if args_opt.optim_option == "LEARNED_SCALE":
+        network = mobilenetv2_mix_quant(num_classes=config.num_classes)
+    else:
+        network = mobilenetV2(num_classes=config.num_classes)
    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
@ -163,11 +194,23 @@ def train_on_gpu():
        load_nonquant_param_into_quant_net(network, param_dict)

    # convert fusion network to quantization aware network
-    quantizer = QuantizationAwareTraining(bn_fold=True,
-                                          per_channel=[True, False],
-                                          symmetric=[False, False],
-                                          freeze_bn=1000000,
-                                          quant_delay=step_size * 2)
+    if args_opt.optim_option == "LEARNED_SCALE":
+        quant_optim_otions = OptimizeOption.LEARNED_SCALE
+        quantizer = QuantizationAwareTraining(bn_fold=True,
+                                              per_channel=[True, False],
+                                              symmetric=[True, True],
+                                              narrow_range=[True, True],
+                                              quant_dtype=(QuantDtype.INT4, QuantDtype.INT8),
+                                              freeze_bn=0,
+                                              quant_delay=0,
+                                              one_conv_fold=True,
+                                              optimize_option=quant_optim_otions)
+    else:
+        quantizer = QuantizationAwareTraining(bn_fold=True,
+                                              per_channel=[True, False],
+                                              symmetric=[False, False],
+                                              freeze_bn=1000000,
+                                              quant_delay=step_size * 2)
    network = quantizer.quantize(network)

    # get learning rate
--- a/tests/st/quantization/lenet_quant/test_lenet_quant.py
+++ b/tests/st/quantization/lenet_quant/test_lenet_quant.py
@ -27,6 +27,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni
 from mindspore import load_checkpoint, load_param_into_net, export
 from mindspore.train import Model
 from mindspore.compression.quant import QuantizationAwareTraining
+from mindspore.compression.quant.quantizer import OptimizeOption
 from mindspore.compression.quant.quant_utils import load_nonquant_param_into_quant_net
 from dataset import create_dataset
 from config import nonquant_cfg, quant_cfg
@ -58,7 +59,30 @@ def train_lenet():
                dataset_sink_mode=True)


-def train_lenet_quant():
+def eval_lenet():
+    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
+    cfg = nonquant_cfg
+    ds_eval = create_dataset(os.path.join(data_path, "test"), cfg.batch_size, 1)
+    ckpt_path = './ckpt_lenet_noquant-10_1875.ckpt'
+    # define fusion network
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    # call back and monitor
+    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
+    # load quantization aware network checkpoint
+    param_dict = load_checkpoint(ckpt_path)
+    not_load_param = load_param_into_net(network, param_dict)
+    if not_load_param:
+        raise ValueError("Load param into net fail!")
+
+    print("============== Starting Testing ==============")
+    acc = model.eval(ds_eval, dataset_sink_mode=True)
+    print("============== {} ==============".format(acc))
+    assert acc['Accuracy'] > 0.98
+
+
+def train_lenet_quant(optim_option="QAT"):
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    ckpt_path = './ckpt_lenet_noquant-10_1875.ckpt'
@ -73,10 +97,21 @@ def train_lenet_quant():
    load_nonquant_param_into_quant_net(network, param_dict)

    # convert fusion network to quantization aware network
-    quantizer = QuantizationAwareTraining(quant_delay=900,
-                                          bn_fold=False,
-                                          per_channel=[True, False],
-                                          symmetric=[True, False])
+    if optim_option == "LEARNED_SCALE":
+        quant_optim_otions = OptimizeOption.LEARNED_SCALE
+        quantizer = QuantizationAwareTraining(bn_fold=False,
+                                              per_channel=[True, False],
+                                              symmetric=[True, True],
+                                              narrow_range=[True, True],
+                                              freeze_bn=0,
+                                              quant_delay=0,
+                                              one_conv_fold=True,
+                                              optimize_option=quant_optim_otions)
+    else:
+        quantizer = QuantizationAwareTraining(quant_delay=900,
+                                              bn_fold=False,
+                                              per_channel=[True, False],
+                                              symmetric=[True, False])
    network = quantizer.quantize(network)

    # define network loss
@ -87,7 +122,7 @@ def train_lenet_quant():
    # call back and monitor
    config_ckpt = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size,
                                   keep_checkpoint_max=cfg.keep_checkpoint_max)
-    ckpt_callback = ModelCheckpoint(prefix="ckpt_lenet_quant", config=config_ckpt)
+    ckpt_callback = ModelCheckpoint(prefix="ckpt_lenet_quant"+optim_option, config=config_ckpt)

    # define model
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
@ -98,19 +133,30 @@ def train_lenet_quant():
    print("============== End Training ==============")


-def eval_quant():
+def eval_quant(optim_option="QAT"):
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    ds_eval = create_dataset(os.path.join(data_path, "test"), cfg.batch_size, 1)
-    ckpt_path = './ckpt_lenet_quant-10_937.ckpt'
+    ckpt_path = './ckpt_lenet_quant'+optim_option+'-10_937.ckpt'
    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)
    # convert fusion network to quantization aware network
-    quantizer = QuantizationAwareTraining(quant_delay=0,
-                                          bn_fold=False,
-                                          freeze_bn=10000,
-                                          per_channel=[True, False],
-                                          symmetric=[True, False])
+    if optim_option == "LEARNED_SCALE":
+        quant_optim_otions = OptimizeOption.LEARNED_SCALE
+        quantizer = QuantizationAwareTraining(bn_fold=False,
+                                              per_channel=[True, False],
+                                              symmetric=[True, True],
+                                              narrow_range=[True, True],
+                                              freeze_bn=0,
+                                              quant_delay=0,
+                                              one_conv_fold=True,
+                                              optimize_option=quant_optim_otions)
+    else:
+        quantizer = QuantizationAwareTraining(quant_delay=0,
+                                              bn_fold=False,
+                                              freeze_bn=10000,
+                                              per_channel=[True, False],
+                                              symmetric=[True, False])
    network = quantizer.quantize(network)

    # define loss
@ -132,17 +178,29 @@ def eval_quant():
    print("============== {} ==============".format(acc))
    assert acc['Accuracy'] > 0.98

-def export_lenet():
+
+def export_lenet(optim_option="QAT"):
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)
    # convert fusion network to quantization aware network
-    quantizer = QuantizationAwareTraining(quant_delay=0,
-                                          bn_fold=False,
-                                          freeze_bn=10000,
-                                          per_channel=[True, False],
-                                          symmetric=[True, False])
+    if optim_option == "LEARNED_SCALE":
+        quant_optim_otions = OptimizeOption.LEARNED_SCALE
+        quantizer = QuantizationAwareTraining(bn_fold=False,
+                                              per_channel=[True, False],
+                                              symmetric=[True, True],
+                                              narrow_range=[True, True],
+                                              freeze_bn=0,
+                                              quant_delay=0,
+                                              one_conv_fold=True,
+                                              optimize_option=quant_optim_otions)
+    else:
+        quantizer = QuantizationAwareTraining(quant_delay=0,
+                                              bn_fold=False,
+                                              freeze_bn=10000,
+                                              per_channel=[True, False],
+                                              symmetric=[True, False])
    network = quantizer.quantize(network)

    # export network
@ -155,9 +213,13 @@ def export_lenet():
@pytest.mark.env_onecard
 def test_lenet_quant():
    train_lenet()
+    eval_lenet()
    train_lenet_quant()
    eval_quant()
    export_lenet()
+    train_lenet_quant(optim_option="LEARNED_SCALE")
+    eval_quant(optim_option="LEARNED_SCALE")
+    export_lenet(optim_option="LEARNED_SCALE")


 if __name__ == "__main__":
--- a/tests/ut/python/train/quant/test_quant.py
+++ b/tests/ut/python/train/quant/test_quant.py
@ -21,6 +21,7 @@ from mindspore import Tensor
 from mindspore import nn
 from mindspore.compression.quant import QuantizationAwareTraining
 from mindspore.compression.export import quant_export
+from mindspore.compression.quant.quantizer import OptimizeOption
 from model_zoo.official.cv.mobilenetv2_quant.src.mobilenetV2 import mobilenetV2

 context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
@ -99,3 +100,56 @@ def test_qat_mobile_per_channel_ff():
    # should load the checkpoint. mock here
    network.init_parameters_data()
    quant_export.export(network, img, file_name="quant.pb")
+
+
+@pytest.mark.skip(reason="no `te.lang.cce` in ut env")
+def test_lsq_lenet():
+    img = Tensor(np.ones((32, 1, 32, 32)).astype(np.float32))
+    net = LeNet5()
+    quantizer = QuantizationAwareTraining(bn_fold=True,
+                                          per_channel=[True, False],
+                                          symmetric=[True, True],
+                                          narrow_range=[True, True],
+                                          freeze_bn=0,
+                                          quant_delay=0,
+                                          one_conv_fold=True,
+                                          optimize_option=OptimizeOption.LEARNED_SCALE)
+    net = quantizer.quantize(net)
+    # should load the checkpoint. mock here
+    net.init_parameters_data()
+    quant_export.export(net, img, file_name="quant.pb")
+
+
+@pytest.mark.skip(reason="no `te.lang.cce` in ut env")
+def test_lsq_mobile_per_channel_tf():
+    network = mobilenetV2(num_classes=1000)
+    img = Tensor(np.ones((1, 3, 224, 224)).astype(np.float32))
+    quantizer = QuantizationAwareTraining(bn_fold=True,
+                                          per_channel=[True, False],
+                                          symmetric=[True, True],
+                                          narrow_range=[True, True],
+                                          freeze_bn=0,
+                                          quant_delay=0,
+                                          one_conv_fold=True,
+                                          optimize_option=OptimizeOption.LEARNED_SCALE)
+    network = quantizer.quantize(network)
+    # should load the checkpoint. mock here
+    network.init_parameters_data()
+    quant_export.export(network, img, file_name="quant.pb")
+
+@pytest.mark.skip(reason="no `te.lang.cce` in ut env")
+def test_lsq_mobile_per_channel_ff():
+    network = mobilenetV2(num_classes=1000)
+    img = Tensor(np.ones((1, 3, 224, 224)).astype(np.float32))
+    quantizer = QuantizationAwareTraining(bn_fold=True,
+                                          per_channel=[False, False],
+                                          symmetric=[True, True],
+                                          narrow_range=[True, True],
+                                          freeze_bn=0,
+                                          quant_delay=0,
+                                          one_conv_fold=True,
+                                          optimize_option=OptimizeOption.LEARNED_SCALE)
+    network = quantizer.quantize(network)
+    # should load the checkpoint. mock here
+    network.init_parameters_data()
+    quant_export.export(network, img, file_name="quant.pb")