From c43b41a378bde434e02aa1aaf80eb6fe2b5a43d8 Mon Sep 17 00:00:00 2001
From: Haim Moushkatel
Date: Wed, 22 Dec 2021 17:05:24 +0200
Subject: [PATCH] Cropper Support for ToD

---
 cmake/package_lite.cmake                      |   4 +
 .../cpu/nnacl/activation_parameter.h          |  28 ++
 .../cpu/nnacl/fp32/activation_fp32.h          |   9 +-
 .../cpu/nnacl/fp32_grad/batch_norm.h          |   7 +-
 .../nnacl/fp32_grad/batch_norm_parameter.h    |  27 ++
 .../cpu/nnacl/fp32_grad/pack_ext.c            |   1 -
 .../cpu/nnacl/fp32_grad/resize_grad.h         |  14 +-
 .../nnacl/fp32_grad/resize_grad_parameter.h   |  34 +++
 .../softmax_crossentropy_parameter.h          |  36 ++++
 .../cpu/nnacl/fp32_grad/softmax_grad.c        |   1 -
 .../cpu/nnacl/fp32_grad/softmax_grad.h        |  16 +-
 mindspore/lite/src/inner_kernel.h             |   2 +
 .../runtime/kernel/arm/fp32/batchnorm_fp32.cc |  10 ++
 .../runtime/kernel/arm/fp32/batchnorm_fp32.h  |   1 +
 .../src/train/train_populate_parameter.cc     |  10 +-
 .../src/train/train_populate_parameter_v0.cc  |   9 +-
 mindspore/lite/src/train/train_session.cc     |  20 +-
 mindspore/lite/src/train/transfer_session.cc  |  21 +-
 .../tools/cropper/build_cropper_config.sh     | 121 ++++++++++++------
 19 files changed, 253 insertions(+), 118 deletions(-)
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/activation_parameter.h
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm_parameter.h
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/resize_grad_parameter.h
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_crossentropy_parameter.h

diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake
index 8a581ba5cc4..838b613e369 100644
--- a/cmake/package_lite.cmake
+++ b/cmake/package_lite.cmake
@@ -599,6 +599,10 @@ else()
                 DESTINATION ${CROPPER_ROOT_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
         install(FILES ${TOP_DIR}/mindspore/lite/build/tools/cropper/cropper_mapping_npu.cfg
                 DESTINATION ${CROPPER_ROOT_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
+        if(SUPPORT_TRAIN)
+            install(FILES ${TOP_DIR}/mindspore/lite/build/tools/cropper/cropper_mapping_cpu_train.cfg
+                    DESTINATION ${CROPPER_ROOT_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
+        endif()
     endif()
 endif()

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/activation_parameter.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/activation_parameter.h
new file mode 100644
index 00000000000..2b5bae55930
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/activation_parameter.h
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_ACTIVATION_PARAMETER_H_
+#define MINDSPORE_NNACL_ACTIVATION_PARAMETER_H_
+
+#include "nnacl/op_base.h"
+typedef struct ActivationParameter {
+  OpParameter op_parameter_;
+  int type_;
+  float alpha_;
+  float min_val_;
+  float max_val_;
+} ActivationParameter;
+
+#endif  // MINDSPORE_NNACL_ACTIVATION_PARAMETER_H_

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.h
index 4090da2a294..288d88755dd 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.h
@@ -19,14 +19,7 @@
 #include <math.h>
 #include "nnacl/op_base.h"
 #include "nnacl/int8/fixed_point.h"
-
-typedef struct ActivationParameter {
-  OpParameter op_parameter_;
-  int type_;
-  float alpha_;
-  float min_val_;
-  float max_val_;
-} ActivationParameter;
+#include "nnacl/activation_parameter.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm.h
index 7cdf3f0ee48..2e31b13e4c3 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm.h
@@ -17,12 +17,7 @@
 #ifndef MINDSPORE_NNACL_FP32_GRAD_BATCH_NORM_H_
 #define MINDSPORE_NNACL_FP32_GRAD_BATCH_NORM_H_
 
-#include "nnacl/op_base.h"
-
-typedef struct BNGradParameter {
-  OpParameter op_parameter_;
-  float epsilon_;
-} BNGradParameter;
+#include "nnacl/fp32_grad/batch_norm_parameter.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm_parameter.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm_parameter.h
new file mode 100644
index 00000000000..b51acdf5ac9
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm_parameter.h
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_GRAD_BATCH_NORM_PARAMETER_H_
+#define MINDSPORE_NNACL_FP32_GRAD_BATCH_NORM_PARAMETER_H_
+
+#include "nnacl/op_base.h"
+
+typedef struct BNGradParameter {
+  OpParameter op_parameter_;
+  float epsilon_;
+} BNGradParameter;
+
+#endif  // MINDSPORE_NNACL_FP32_GRAD_BATCH_NORM_PARAMETER_H_

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/pack_ext.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/pack_ext.c
index 75032fb17c0..bc1113b80d8 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/pack_ext.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/pack_ext.c
@@ -16,7 +16,6 @@
 #include <string.h>
 #include "nnacl/fp32_grad/pack_ext.h"
-#include "nnacl/pack.h"
 
 void RollingIm2ColPackDwUnitFp32(const float *in_data, const ConvParameter *conv_param, float *data_col_orig,
                                  int real_cal_num, int start) {

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/resize_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/resize_grad.h
index 77d8cd6de36..6a21610b488 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/resize_grad.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/resize_grad.h
@@ -17,24 +17,12 @@
 #ifndef MINDSPORE_NNACL_FP32_GRAD_RESIZE_GRAD_H_
 #define MINDSPORE_NNACL_FP32_GRAD_RESIZE_GRAD_H_
 
-#include "nnacl/op_base.h"
+#include "nnacl/fp32_grad/resize_grad_parameter.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef struct ResizeGradParameter {
-  OpParameter op_parameter_;
-  bool align_corners_;
-  int method;
-  size_t in_height_;
-  size_t in_width_;
-  size_t out_height_;
-  size_t out_width_;
-  float height_scale_;
-  float width_scale_;
-} ResizeGradParameter;
-
 int ResizeNearestNeighborGrad(const float *in_addr, float *out_addr, int batch_size, int channel, int format,
                               const ResizeGradParameter *param);
 int ResizeBiLinearGrad(const float *in_addr, float *out_addr, int batch_size, int channel, int format,

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/resize_grad_parameter.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/resize_grad_parameter.h
new file mode 100644
index 00000000000..b3991dfe69f
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/resize_grad_parameter.h
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_GRAD_RESIZE_GRAD_PARAMETER_H_
+#define MINDSPORE_NNACL_FP32_GRAD_RESIZE_GRAD_PARAMETER_H_
+
+#include "nnacl/op_base.h"
+
+typedef struct ResizeGradParameter {
+  OpParameter op_parameter_;
+  bool align_corners_;
+  int method;
+  size_t in_height_;
+  size_t in_width_;
+  size_t out_height_;
+  size_t out_width_;
+  float height_scale_;
+  float width_scale_;
+} ResizeGradParameter;
+
+#endif  // MINDSPORE_NNACL_FP32_GRAD_RESIZE_GRAD_PARAMETER_H_

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_crossentropy_parameter.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_crossentropy_parameter.h
new file mode 100644
index 00000000000..4ba26e3afaf
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_crossentropy_parameter.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_GRAD_SOFTMAX_CROSSENTROPY_PARAMETER_H_
+#define MINDSPORE_NNACL_FP32_GRAD_SOFTMAX_CROSSENTROPY_PARAMETER_H_
+
+#include "nnacl/op_base.h"
+
+typedef struct SoftmaxCrossEntropyParameter {
+  // primitive parameter
+  OpParameter op_parameter_;
+  int n_dim_;
+
+  // shape correlative
+  int input_shape_[5];
+
+  // other parameter
+  int32_t batch_size_;
+  unsigned int number_of_classes_;
+  bool is_grad_;
+} SoftmaxCrossEntropyParameter;
+
+#endif  // MINDSPORE_NNACL_FP32_GRAD_SOFTMAX_CROSSENTROPY_PARAMETER_H_

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_grad.c
index d2e7cb53aa4..4886bd596d6 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_grad.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_grad.c
@@ -16,7 +16,6 @@
 #include "nnacl/fp32_grad/softmax_grad.h"
 #include <string.h>
-#include "nnacl/fp32_grad/gemm.h"
 
 void SoftmaxGrad(const float *input_ptr, const float *yt_ptr, float *output_ptr, float *sum_data, float *sum_mul,
                  const SoftmaxParameter *parameter) {

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_grad.h
index 005a2d47baa..46465b301e7 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_grad.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/softmax_grad.h
@@ -17,27 +17,13 @@
 #ifndef MINDSPORE_NNACL_FP32_GRAD_SOFTMAX_GRAD_H_
 #define MINDSPORE_NNACL_FP32_GRAD_SOFTMAX_GRAD_H_
 
-#include "nnacl/op_base.h"
 #include "nnacl/fp32/softmax_fp32.h"
+#include "nnacl/fp32_grad/softmax_crossentropy_parameter.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef struct SoftmaxCrossEntropyParameter {
-  // primitive parameter
-  OpParameter op_parameter_;
-  int n_dim_;
-
-  // shape correlative
-  int input_shape_[5];
-
-  // other parameter
-  int32_t batch_size_;
-  unsigned int number_of_classes_;
-  bool is_grad_;
-} SoftmaxCrossEntropyParameter;
-
 void SoftmaxGrad(const float *input_ptr, const float *yt_ptr, float *output_ptr, float *sum_data, float *sum_mul,
                  const SoftmaxParameter *parameter);
 #ifdef __cplusplus

diff --git a/mindspore/lite/src/inner_kernel.h b/mindspore/lite/src/inner_kernel.h
index b1767a1d8e3..2794a962a60 100644
--- a/mindspore/lite/src/inner_kernel.h
+++ b/mindspore/lite/src/inner_kernel.h
@@ -159,6 +159,8 @@ class InnerKernel : public Kernel {
     return mindspore::lite::RET_OK;
   }
 
+  virtual int SetupVirtualBatch(int virtual_batch_multiplier, int param) { return mindspore::lite::RET_OK; }
+
   virtual bool IsEval() const { return !this->train_mode_; }
 
   virtual void SetTrainable(bool trainable = true) { this->trainable_ = trainable; }

diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.cc
index ed3dccd9474..8298732a5a4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.cc
@@ -141,5 +141,15 @@ int BatchnormCPUKernel::RestoreDefaultMomentum() {
   return RET_OK;
 }
 
+int BatchnormCPUKernel::SetupVirtualBatch(int virtual_batch_multiplier, int param) {
+  if ((virtual_batch_multiplier > 0)) {
+    float momentum = (param < 0.0f) ? (this->get_momentum() / virtual_batch_multiplier) : param;
+    return this->set_momentum(momentum);
+  } else {
+    return this->RestoreDefaultMomentum();
+  }
+  return RET_OK;
+}
+
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_BatchNorm, LiteKernelCreator<BatchnormCPUKernel>)
 }  // namespace mindspore::kernel

diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.h
index db550e291ca..6f0aadce4a1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.h
@@ -36,6 +36,7 @@ class BatchnormCPUKernel : public InnerKernel {
   int Prepare() override;
   int ReSize() override;
   int Run() override;
+  int SetupVirtualBatch(int virtual_batch_multiplier, int param) override;
   virtual int InitConstTensor();
   virtual int DoExecute(int task_id);
   virtual int set_momentum(float momentum);

diff --git a/mindspore/lite/src/train/train_populate_parameter.cc b/mindspore/lite/src/train/train_populate_parameter.cc
index 9b48f72ba54..15800e62745 100644
--- a/mindspore/lite/src/train/train_populate_parameter.cc
+++ b/mindspore/lite/src/train/train_populate_parameter.cc
@@ -17,19 +17,19 @@
 #include
 #include "src/ops/populate/populate_register.h"
 #include "src/ops/populate/default_populate.h"
-#include "src/ops/populate/strided_slice_populate.h"
+#include "nnacl/strided_slice_parameter.h"
 #include "nnacl/arithmetic.h"
 #include "nnacl/conv_parameter.h"
 #include "nnacl/lstm_parameter.h"
 #include "nnacl/pooling_parameter.h"
 #include "nnacl/power_parameter.h"
-#include "nnacl/fp32/activation_fp32.h"
-#include "nnacl/fp32_grad/softmax_grad.h"
+#include "nnacl/activation_parameter.h"
+#include "nnacl/fp32_grad/softmax_crossentropy_parameter.h"
 #include "nnacl/fp32_grad/optimizer.h"
-#include "nnacl/fp32_grad/batch_norm.h"
+#include "nnacl/fp32_grad/batch_norm_parameter.h"
 #include "nnacl/fp32_grad/dropout_parameter.h"
 #include "nnacl/fp32_grad/smooth_l1_loss.h"
-#include "nnacl/fp32_grad/resize_grad.h"
+#include "nnacl/fp32_grad/resize_grad_parameter.h"
 
 using mindspore::lite::Registry;

diff --git a/mindspore/lite/src/train/train_populate_parameter_v0.cc b/mindspore/lite/src/train/train_populate_parameter_v0.cc
index aa52e773b11..d3329399c75 100644
--- a/mindspore/lite/src/train/train_populate_parameter_v0.cc
+++ b/mindspore/lite/src/train/train_populate_parameter_v0.cc
@@ -19,18 +19,15 @@
 #include "src/ops/populate/populate_register.h"
 #include "schema/model_v0_generated.h"
 #include "nnacl/pooling_parameter.h"
-#include "nnacl/fp32_grad/softmax_grad.h"
-#include "nnacl/fp32/activation_fp32.h"
+#include "nnacl/fp32_grad/softmax_crossentropy_parameter.h"
+#include "nnacl/activation_parameter.h"
 #include "nnacl/conv_parameter.h"
 #include "nnacl/power_parameter.h"
 #include "nnacl/arithmetic.h"
 #include "nnacl/fp32_grad/optimizer.h"
-#include "nnacl/fp32_grad/batch_norm.h"
+#include "nnacl/fp32_grad/batch_norm_parameter.h"
 #include "nnacl/fp32_grad/dropout_parameter.h"
 #include "nnacl/fp32_grad/smooth_l1_loss.h"
-#include "nnacl/infer/conv2d_grad_filter_infer.h"
-#include "nnacl/infer/conv2d_grad_input_infer.h"
-#include "nnacl/infer/group_conv2d_grad_input_infer.h"
 
 namespace mindspore::kernel {
 namespace {

diff --git a/mindspore/lite/src/train/train_session.cc b/mindspore/lite/src/train/train_session.cc
index e68fe03b778..b4cef76f459 100644
--- a/mindspore/lite/src/train/train_session.cc
+++ b/mindspore/lite/src/train/train_session.cc
@@ -26,18 +26,13 @@
 #include
 #include
 #include "include/errorcode.h"
-#include "src/executor.h"
 #include "src/lite_model.h"
 #include "src/lite_kernel_util.h"
-#include "src/sub_graph_kernel.h"
 #include "src/tensor.h"
 #include "src/kernel_registry.h"
 #include "src/common/prim_util.h"
 #include "src/common/tensor_util.h"
 #include "src/common/utils.h"
-#include "src/runtime/kernel/arm/fp32_grad/convolution.h"
-#include "src/runtime/kernel/arm/fp32/batchnorm_fp32.h"
-#include "src/train/loss_kernel.h"
 #include "src/train/optimizer_kernel.h"
 #include "src/train/train_utils.h"
 #include "src/train/train_export.h"
@@ -890,8 +885,9 @@ int TrainSession::ApplyGradients(const std::vector<tensor::MSTensor *> &gradients) {
       if (current_gradient->tensor_name() == gradient->tensor_name()) {
         found = true;
         if (current_gradient->Size() == gradient->Size()) {
-          std::copy(static_cast<char *>(gradient->data()), static_cast<char *>(gradient->data()) + gradient->Size(),
-                    static_cast<char *>(current_gradient->MutableData()));
+          std::copy(static_cast<char *>(gradient->data()),
+                    static_cast<char *>(gradient->data()) + gradient->Size(),
+                    static_cast<char *>(current_gradient->MutableData()));
         } else {
           MS_LOG(ERROR) << "gradient tensor " << gradient->tensor_name() << " has wrong size " << gradient->Size()
                         << " instead of " << current_gradient->Size();
@@ -954,14 +950,8 @@ int TrainSession::AdminSetupVirtualBatch(int virtual_batch_multiplier, float lr,
   }
 
   if (IsBN(kernel) && kernel->IsTrainable()) {
-    auto batchnorm = static_cast<kernel::BatchnormCPUKernel *>(kernel->kernel());
-    auto ret = RET_OK;
-    if (mod == kernel::WeightUpdateMode::VIRTUAL_BATCH) {
-      momentum = (momentum < 0.0f) ? (batchnorm->get_momentum() / virtual_batch_multiplier_) : momentum;
-      ret = batchnorm->set_momentum(momentum);
-    } else {
-      ret = batchnorm->RestoreDefaultMomentum();
-    }
+    auto batchnorm = static_cast<kernel::InnerKernel *>(kernel->kernel());
+    auto ret = batchnorm->SetupVirtualBatch(virtual_batch_multiplier_, momentum);
     if (ret != RET_OK) {
       MS_LOG(ERROR) << kernel->name() << " failed to set momentum";
       return RET_ERROR;

diff --git a/mindspore/lite/src/train/transfer_session.cc b/mindspore/lite/src/train/transfer_session.cc
index 28532cfeb43..97cecafff9b 100644
--- a/mindspore/lite/src/train/transfer_session.cc
+++ b/mindspore/lite/src/train/transfer_session.cc
@@ -28,12 +28,8 @@
 #include "src/tensor.h"
 #include "src/train/loss_kernel.h"
 #include "src/train/optimizer_kernel.h"
-#include "src/sub_graph_kernel.h"
 #include "src/train/train_populate_parameter.h"
 #include "src/executor.h"
-#include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/fp32_grad/convolution.h"
-#include "nnacl/fp32/pack_fp32.h"
 #include "src/train/train_export.h"
 #include "src/train/train_utils.h"
 
@@ -141,13 +137,22 @@ int TransferSession::RunGraph(const KernelCallBack &before, const KernelCallBack &after) {
   for (auto &backbone_head_pair : backbone_head_map_) {
     auto input = backbone_head_pair.first;
     auto output = backbone_head_pair.second;
-    char *input_data = reinterpret_cast<char *>(input->MutableData());
-    char *output_data = reinterpret_cast<char *>(output->MutableData());
+    float *input_data = reinterpret_cast<float *>(input->MutableData());
+    float *output_data = reinterpret_cast<float *>(output->MutableData());
     if (nchw2nhwc_) {
-      int plane = input->shape().at(1) * input->shape().at(2);
       int batch = input->shape().at(0);
+      int plane = input->shape().at(1) * input->shape().at(2);
       int channel = input->shape().at(3);
-      PackNCHWToNHWCFp32(output_data, input_data, batch, plane, channel, 0, 1);
+      int img_size = plane * channel;
+      for (int b = 0; b < batch; b++) {
+        float *in = input_data + b * img_size;
+        float *out = output_data + b * img_size;
+        for (int p = 0; p < plane; p++) {
+          for (int c = 0; c < channel; c++) {
+            in[p * channel + c] = out[c * plane + p];
+          }
+        }
+      }
     } else {
       std::copy(output_data, output_data + output->Size(), input_data);
     }

diff --git a/mindspore/lite/tools/cropper/build_cropper_config.sh b/mindspore/lite/tools/cropper/build_cropper_config.sh
index aed5fec9e71..b82626051aa 100644
--- a/mindspore/lite/tools/cropper/build_cropper_config.sh
+++ b/mindspore/lite/tools/cropper/build_cropper_config.sh
@@ -7,13 +7,18 @@ cd "${MINDSPORE_HOME}" || exit 1
 CROPPER_OUTPUT_DIR=mindspore/lite/build/tools/cropper
 mkdir -p ${CROPPER_OUTPUT_DIR}
 MAPPING_OUTPUT_FILE_NAME_TMP=${CROPPER_OUTPUT_DIR}/cropper_mapping_tmp.cfg
+MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP=${CROPPER_OUTPUT_DIR}/cropper_mapping_train_tmp.cfg
 CPU_MAPPING_OUTPUT_FILE=${CROPPER_OUTPUT_DIR}/cropper_mapping_cpu.cfg
 GPU_MAPPING_OUTPUT_FILE=${CROPPER_OUTPUT_DIR}/cropper_mapping_gpu.cfg
 NPU_MAPPING_OUTPUT_FILE=${CROPPER_OUTPUT_DIR}/cropper_mapping_npu.cfg
+CPU_TRAIN_MAPPING_OUTPUT_FILE=${CROPPER_OUTPUT_DIR}/cropper_mapping_cpu_train.cfg
 [ -n "${MAPPING_OUTPUT_FILE_NAME_TMP}" ] && rm -f ${MAPPING_OUTPUT_FILE_NAME_TMP}
+[ -n "${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}" ] && rm -f ${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}
 [ -n "${CPU_MAPPING_OUTPUT_FILE}" ] && rm -f ${CPU_MAPPING_OUTPUT_FILE}
 [ -n "${GPU_MAPPING_OUTPUT_FILE}" ] && rm -f ${GPU_MAPPING_OUTPUT_FILE}
 [ -n "${NPU_MAPPING_OUTPUT_FILE}" ] && rm -f ${NPU_MAPPING_OUTPUT_FILE}
+[ -n "${CPU_TRAIN_MAPPING_OUTPUT_FILE}" ] && rm -f ${CPU_TRAIN_MAPPING_OUTPUT_FILE}
+
 ops_list=()
 DEFINE_STR="-DENABLE_ANDROID -DENABLE_ARM -DENABLE_ARM64 -DENABLE_NEON -DNO_DLIB -DUSE_ANDROID_LOG -DANDROID -DENABLE_FP16"
 # get the flatbuffers path
@@ -57,11 +62,17 @@ getDeep() {
     # only add existing files
     if [[ -e ${array_deep_file%h*}cc ]]; then
       file_split=$(echo ${array_deep_file} | awk -F '/' '{print $NF}')
-      echo "${1},${3},${file_split%h*}cc.o" >>${MAPPING_OUTPUT_FILE_NAME_TMP}
+      if [[ "$4" != "train_source" ]] ; then
+        echo "${1},${3},${file_split%h*}cc.o" >>${MAPPING_OUTPUT_FILE_NAME_TMP}
+      fi
+      echo "${1},${3},${file_split%h*}cc.o" >>${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}
     fi
     if [[ -e ${array_deep_file%h*}c ]]; then
      file_split=$(echo ${array_deep_file} | awk -F '/' '{print $NF}')
-      echo "${1},${3},${file_split%h*}c.o" >>${MAPPING_OUTPUT_FILE_NAME_TMP}
+      if [[ "$4" != "train_source" ]] ; then
+        echo "${1},${3},${file_split%h*}c.o" >>${MAPPING_OUTPUT_FILE_NAME_TMP}
+      fi
+      echo "${1},${3},${file_split%h*}c.o" >>${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}
     fi
   done
 }
@@ -79,6 +90,7 @@ getOpsFile() {
       out_file=$(echo ${file} | awk -F '/' '{print $NF}')
       # concat schemaType + fileType + fileName append to files
       echo "${type},${3},${out_file}.o" >>${MAPPING_OUTPUT_FILE_NAME_TMP}
+      echo "${type},${3},${out_file}.o" >>${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}
       map_files=$(gcc -MM ${file} ${DEFINE_STR} ${HEADER_LOCATION})
       # first is *.o second is *.cc
       array_file=()
@@ -91,18 +103,44 @@ getOpsFile() {
        getDeep ${type} ${array_file} ${3} &
        array_file_split=$(echo ${array_file} | awk -F '/' '{print $NF}')
        echo "${type},${3},${array_file_split%h*}cc.o" >>${MAPPING_OUTPUT_FILE_NAME_TMP}
+       echo "${type},${3},${array_file_split%h*}cc.o" >>${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}
      fi
      if [[ -e ${array_file%h*}c ]]; then
        getDeep ${type} ${array_file%h*}c ${3} &
        getDeep ${type} ${array_file} ${3} &
        array_file_split=$(echo ${array_file} | awk -F '/' '{print $NF}')
        echo "${type},${3},${array_file_split%h*}c.o" >>${MAPPING_OUTPUT_FILE_NAME_TMP}
+       echo "${type},${3},${array_file_split%h*}c.o" >>${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}
      fi
    done
  done
 done
 }
 
+getFilesFromArr() {
+  local -n arr_files=${1}
+  # echo " func parm 1 : ${arr_files[@]}"
+  # echo " func parm 2 : $2"
+  # shellcheck disable=SC2068
+  for file in ${arr_files[@]}; do
+    map_files=$(gcc -MM ${file} ${DEFINE_STR} ${HEADER_LOCATION})
+    # first is *.o second is *.cc
+    # shellcheck disable=SC2207
+    array_runtime=($(echo ${map_files} | awk -F '\' '{for(i=3;i<=NF;i++){print $i}}' | grep -v "flatbuffers" | egrep -v ${REMOVE_LISTS_STR}))
+    # only add existing files
+    for array_runtime_file in "${array_runtime[@]}"; do
+      if [[ -e ${array_runtime_file%h*}cc && ! ${all_files[*]} =~ ${array_runtime_file%h*}cc ]]; then
+        all_files=("${all_files[@]}" "${array_runtime_file%h*}cc")
+        getDeep "CommonFile" ${array_runtime_file%h*}cc "common" $2 &
+      fi
+      if [[ -e ${array_runtime_file%h*}c && ! ${all_files[*]} =~ ${array_runtime_file%h*}c ]]; then
+        all_files=("${all_files[@]}" "${array_runtime_file%h*}c")
+        getDeep "CommonFile" ${array_runtime_file%h*}c "common" $2 &
+      fi
+    done
+  done
+}
+
 getCommonFile() {
   echo "start get common files"
   include_h=()
@@ -115,6 +153,9 @@ getCommonFile() {
   while IFS='' read -r line; do common_files_h+=("$line"); done < <(ls mindspore/lite/src/common/*.h)
   runtime_files_h=()
   while IFS='' read -r line; do runtime_files_h+=("$line"); done < <(ls mindspore/lite/src/runtime/*.h)
+  train_files_h=()
+  while IFS='' read -r line; do train_files_h+=("$line"); done < <(ls mindspore/lite/include/train/*.h)
+  while IFS='' read -r line; do train_files_h+=("$line"); done < <(ls mindspore/lite/src/train/*.h)
   others_files_h=(
     mindspore/lite/src/runtime/infer_manager.h
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h
@@ -130,7 +171,9 @@ getCommonFile() {
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/tensor_c.h
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/errorcode.h
   )
-  all_files_h=("${include_h[@]}" "${regist_include_h[@]}" "${src_files_h[@]}" "${common_files_h[@]}" "${runtime_files_h[@]}" "${others_files_h[@]}")
+  all_files_h=("${include_h[@]}" "${regist_include_h[@]}" "${src_files_h[@]}" "${common_files_h[@]}"
+    "${runtime_files_h[@]}" "${others_files_h[@]}"
+  )
 
   # concat regx
   REMOVE_LISTS_STR="${all_files_h[0]}"
@@ -167,51 +210,43 @@ getCommonFile() {
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.c
     mindspore/core/utils/status.cc
   )
+  # save train files
+  train_files=()
+  while IFS='' read -r line; do train_files+=("$line"); done < <(ls mindspore/lite/src/train/*.cc)
+  while IFS='' read -r line; do train_files+=("$line"); done < <(ls mindspore/lite/src/cxx_api/callback/*.cc)
+  while IFS='' read -r line; do train_files+=("$line"); done < <(ls mindspore/lite/src/cxx_api/metrics/*.cc)
+  while IFS='' read -r line; do train_files+=("$line"); done < <(ls mindspore/lite/src/cxx_api/train/*.cc)
+  others_train_files=(
+    mindspore/lite/tools/common/storage.cc
+  )
   all_files=("${src_files[@]}" "${regist_files[@]}" "${common_files[@]}" "${runtime_files_cc[@]}"
     "${others_files_c[@]}" "${assembly_files[@]}" "${mindrt_files[@]}" "${cxx_api_files[@]}"
-  )
-  # shellcheck disable=SC2068
-  for file in ${all_files[@]}; do
-    map_files=$(gcc -MM ${file} ${DEFINE_STR} ${HEADER_LOCATION})
-    # first is *.o second is *.cc
-    # shellcheck disable=SC2207
-    array_runtime=($(echo ${map_files} | awk -F '\' '{for(i=3;i<=NF;i++){print $i}}' | grep -v "flatbuffers" | egrep -v ${REMOVE_LISTS_STR}))
-    # only add existing files
-    for array_runtime_file in "${array_runtime[@]}"; do
-      if [[ -e ${array_runtime_file%h*}cc && ! ${all_files[*]} =~ ${array_runtime_file%h*}cc ]]; then
-        all_files=("${all_files[@]}" "${array_runtime_file%h*}cc")
-        getDeep "CommonFile" ${array_runtime_file%h*}cc "common" &
-      fi
-      if [[ -e ${array_runtime_file%h*}c && ! ${all_files[*]} =~ ${array_runtime_file%h*}c ]]; then
-        all_files=("${all_files[@]}" "${array_runtime_file%h*}c")
-        getDeep "CommonFile" ${array_runtime_file%h*}c "common" &
-      fi
-    done
-  done
-  # shellcheck disable=SC2068
-  for file in ${all_files_h[@]}; do
-    map_files=$(gcc -MM ${file} ${DEFINE_STR} ${HEADER_LOCATION})
-    # first is *.o second is *.cc
-    # shellcheck disable=SC2207
-    array_runtime=($(echo ${map_files} | awk -F '\' '{for(i=3;i<=NF;i++){print $i}}' | grep -v "flatbuffers" | egrep -v ${REMOVE_LISTS_STR}))
-    # only add existing files
-    for array_runtime_file in "${array_runtime[@]}"; do
-      if [[ -e ${array_runtime_file%h*}cc && ! ${all_files[*]} =~ ${array_runtime_file%h*}cc ]]; then
-        all_files=("${all_files[@]}" "${array_runtime_file%h*}cc")
-        getDeep "CommonFile" ${array_runtime_file%h*}cc "common" &
-      fi
-      if [[ -e ${array_runtime_file%h*}c && ! ${all_files[*]} =~ ${array_runtime_file%h*}c ]]; then
-        all_files=("${all_files[@]}" "${array_runtime_file%h*}c")
-        getDeep "CommonFile" ${array_runtime_file%h*}c "common" &
-      fi
-    done
-  done
+  )
+  getFilesFromArr all_files
+  getFilesFromArr all_files_h
   # shellcheck disable=SC2068
   for file in ${all_files[@]}; do
     file=$(echo ${file} | awk -F '/' '{print $NF}')
     echo "CommonFile,common,${file}.o" >>${MAPPING_OUTPUT_FILE_NAME_TMP}
   done
+
+  all_files_train=("${all_files[@]}" "${train_files[@]}" "${others_train_files[@]}"
+  )
+  all_files_train_h=("${all_files_h[@]}" "${train_files_h[@]}"
+  )
+  REMOVE_LISTS_STR="${all_files_train_h[0]}"
+  # shellcheck disable=SC2068
+  for val in ${all_files_train_h[@]:1}; do
+    REMOVE_LISTS_STR="$REMOVE_LISTS_STR|$val"
+  done
+  getFilesFromArr all_files_train "train_source"
+  getFilesFromArr all_files_train_h "train_source"
+  # shellcheck disable=SC2068
+  for file in ${all_files_train[@]}; do
+    file=$(echo ${file} | awk -F '/' '{print $NF}')
+    echo "CommonFile,common,${file}.o" >>${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}
+  done
 }
 
 # The x86 platform cannot search based on header files, so manually search for the first layer.
@@ -276,6 +311,11 @@ sleep 1
 sort ${MAPPING_OUTPUT_FILE_NAME_TMP} | uniq >${CPU_MAPPING_OUTPUT_FILE}
 chmod 444 ${CPU_MAPPING_OUTPUT_FILE}
 
+sleep 1
+# remove duplicate files
+sort ${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP} | uniq >${CPU_TRAIN_MAPPING_OUTPUT_FILE}
+chmod 444 ${CPU_TRAIN_MAPPING_OUTPUT_FILE}
+
 # support for gpu
 opencl_files=()
 while IFS='' read -r line; do opencl_files+=("$line"); done < <(ls mindspore/lite/src/runtime/kernel/opencl/*.cc)
@@ -320,4 +360,5 @@ chmod 444 ${NPU_MAPPING_OUTPUT_FILE}
 
 # modify file permissions to read-only
 [ -n "${MAPPING_OUTPUT_FILE_NAME_TMP}" ] && rm -f ${MAPPING_OUTPUT_FILE_NAME_TMP}
+[ -n "${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}" ] && rm -f ${MAPPING_OUTPUT_FILE_NAME_TRAIN_TMP}
 echo "Complete all tasks."
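
Reviewer notes (illustrative material below; not part of the patch to apply):

1. Virtual-batch hook. train_session.cc no longer downcasts to BatchnormCPUKernel to tweak
momentum; the logic now lives behind the new InnerKernel::SetupVirtualBatch() virtual, which
is what lets the cropper drop the batch-norm kernel and its headers from models that never
use them. A self-contained sketch of the momentum contract; FakeBatchNorm is a hypothetical
stand-in, not a MindSpore class, and unlike the patch (whose virtual declares `int param`,
so a fractional override is truncated at that boundary) it takes the override as a float:

#include <iostream>

struct FakeBatchNorm {
  float momentum_ = 0.9f;          // what get_momentum() would return
  float default_momentum_ = 0.9f;  // what RestoreDefaultMomentum() restores

  int SetupVirtualBatch(int virtual_batch_multiplier, float param) {
    if (virtual_batch_multiplier > 0) {
      // negative override: derive momentum from the multiplier; otherwise take it as given
      momentum_ = (param < 0.0f) ? (momentum_ / virtual_batch_multiplier) : param;
    } else {
      momentum_ = default_momentum_;  // multiplier <= 0 disables virtual batching
    }
    return 0;  // RET_OK
  }
};

int main() {
  FakeBatchNorm bn;
  bn.SetupVirtualBatch(4, -1.0f);
  std::cout << bn.momentum_ << "\n";  // 0.225: scaled down for 4 accumulated micro-batches
  bn.SetupVirtualBatch(0, -1.0f);
  std::cout << bn.momentum_ << "\n";  // 0.9: default restored
  return 0;
}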
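
2. NCHW to NHWC repack. TransferSession::RunGraph() now open-codes the transpose instead of
calling PackNCHWToNHWCFp32(), which removes the nnacl pack_fp32 dependency from the cropper's
common-file set. The same index math as a standalone sketch (NchwToNhwc is an illustrative
name, not a function introduced by the patch):

#include <cstdio>

// dst is NHWC (plane-major, channel innermost); src is NCHW (channel-major).
// plane collapses H and W into a single spatial index p = h * W + w.
void NchwToNhwc(const float *src, float *dst, int batch, int plane, int channel) {
  int img_size = plane * channel;
  for (int b = 0; b < batch; b++) {
    const float *in = src + b * img_size;
    float *out = dst + b * img_size;
    for (int p = 0; p < plane; p++) {
      for (int c = 0; c < channel; c++) {
        out[p * channel + c] = in[c * plane + p];  // NHWC[b][p][c] = NCHW[b][c][p]
      }
    }
  }
}

int main() {
  // One 2-pixel image with 2 channels; NCHW order is c0p0 c0p1 c1p0 c1p1.
  const float nchw[4] = {1, 2, 3, 4};
  float nhwc[4] = {0};
  NchwToNhwc(nchw, nhwc, 1, 2, 2);
  printf("%g %g %g %g\n", nhwc[0], nhwc[1], nhwc[2], nhwc[3]);  // prints: 1 3 2 4
  return 0;
}

One caveat worth a second look: input_data and output_data are now float pointers, but the
unchanged else branch still copies output->Size() elements; if Size() is in bytes, that
std::copy walks four times the tensor.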
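
3. Cropper config for training. getDeep() gains a fourth argument and getFilesFromArr() a
second one ("train_source"): every discovered object is recorded in the train mapping, while
non-train dependencies keep flowing into the existing cpu/gpu/npu configs, and
package_lite.cmake installs the new cropper_mapping_cpu_train.cfg when SUPPORT_TRAIN is on.
A usage sketch; the library and model file names below are placeholders, and the cropper
flags follow the existing tool's documented conventions, so check them against your release:

bash mindspore/lite/tools/cropper/build_cropper_config.sh
# a training build now also emits cropper_mapping_cpu_train.cfg next to the cpu/gpu/npu ones
./cropper --packageFile=./libmindspore-lite-train.a \
          --configFile=./cropper_mapping_cpu_train.cfg \
          --modelFile=./my_tod_model.ms \
          --outputFile=./libmindspore-lite-train-cropped.a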