!22706 [BETA] sponge ops: lj, pme, simple constrain
Merge pull request !22706 from jiahongQian/master0831
commit 7d8ffd6a69

backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cu
@@ -0,0 +1,144 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * LJForce. This is an experimental interface that is subject to change and/or deletion.
 */

#include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

__global__ void LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy_CUDA(
  const int atom_numbers, const NEIGHBOR_LIST *nl, const UINT_VECTOR_LJ_TYPE *uint_crd, const VECTOR *boxlength,
  const float *LJ_type_A, const float *LJ_type_B, const float cutoff, VECTOR *frc, const float pme_beta,
  const float sqrt_pi, float *atom_lj_virial, float *atom_direct_cf_energy) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    NEIGHBOR_LIST nl_i = nl[atom_i];
    int N = nl_i.atom_numbers;
    int atom_j;
    int int_x;
    int int_y;
    int int_z;
    UINT_VECTOR_LJ_TYPE r1 = uint_crd[atom_i], r2;
    VECTOR dr;
    float dr_2;
    float dr_4;
    float dr_8;
    float dr_6;
    float frc_abs = 0.;
    VECTOR frc_lin;
    VECTOR frc_record = {0., 0., 0.};
    float charge_i = r1.charge;
    float charge_j;
    float dr_abs;
    float dr_1;
    float beta_dr;
    float frc_cf_abs;
    float virial_lin = 0.;
    float energy_lin = 0.;

    int x, y;
    int atom_pair_LJ_type;
    for (int j = threadIdx.y; j < N; j = j + blockDim.y) {
      atom_j = nl_i.atom_serial[j];
      r2 = uint_crd[atom_j];
      charge_j = r2.charge;

      int_x = r2.uint_x - r1.uint_x;
      int_y = r2.uint_y - r1.uint_y;
      int_z = r2.uint_z - r1.uint_z;
      dr.x = boxlength[0].x * int_x;
      dr.y = boxlength[0].y * int_y;
      dr.z = boxlength[0].z * int_z;
      dr_abs = norm3df(dr.x, dr.y, dr.z);
      if (dr_abs < cutoff) {
        dr_1 = 1. / dr_abs;
        dr_2 = dr_1 * dr_1;
        dr_4 = dr_2 * dr_2;
        dr_8 = dr_4 * dr_4;
        dr_6 = dr_4 * dr_2;

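        // Branchless pair-type lookup: x = y >> 31 sign-extends to -1 or 0,
        // so (y ^ x) - x is |LJ_type_j - LJ_type_i|; (x + y) >> 1 and
        // (x - y) >> 1 then recover the larger and smaller of the two types,
        // and max * (max + 1) / 2 + min indexes the triangular table of
        // pairwise A/B Lennard-Jones coefficients.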
        y = (r2.LJ_type - r1.LJ_type);
        x = y >> 31;
        y = (y ^ x) - x;
        x = r2.LJ_type + r1.LJ_type;
        r2.LJ_type = (x + y) >> 1;
        x = (x - y) >> 1;
        atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;

        frc_abs = (-LJ_type_A[atom_pair_LJ_type] * dr_6 + LJ_type_B[atom_pair_LJ_type]) * dr_8;
        beta_dr = pme_beta * dr_abs;
        frc_cf_abs = beta_dr * sqrt_pi * expf(-beta_dr * beta_dr) + erfcf(beta_dr);
        frc_cf_abs = frc_cf_abs * dr_2 * dr_1;
        frc_cf_abs = charge_i * charge_j * frc_cf_abs;
        energy_lin = energy_lin + charge_i * charge_j * erfcf(beta_dr) * dr_1;
        virial_lin = virial_lin - frc_abs * dr_abs * dr_abs;
        frc_abs = frc_abs - frc_cf_abs;
        frc_lin.x = frc_abs * dr.x;
        frc_lin.y = frc_abs * dr.y;
        frc_lin.z = frc_abs * dr.z;
        frc_record.x = frc_record.x + frc_lin.x;
        frc_record.y = frc_record.y + frc_lin.y;
        frc_record.z = frc_record.z + frc_lin.z;
        atomicAdd(&frc[atom_j].x, -frc_lin.x);
        atomicAdd(&frc[atom_j].y, -frc_lin.y);
        atomicAdd(&frc[atom_j].z, -frc_lin.z);
      }
    }
    atomicAdd(&frc[atom_i].x, frc_record.x);
    atomicAdd(&frc[atom_i].y, frc_record.y);
    atomicAdd(&frc[atom_i].z, frc_record.z);

    atomicAdd(&atom_direct_cf_energy[atom_i], energy_lin);
    atomicAdd(&atom_lj_virial[atom_i], virial_lin);
  }
}

void LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy(
  const int atom_numbers, const float cutoff, const float pme_beta, const unsigned int *uint_crd_f, const int *LJtype,
  const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, int *nl_atom_serial,
  int *nl, const float *d_LJ_A, const float *d_LJ_B, float *frc_f, float *atom_lj_virial, float *atom_energy,
  int max_neighbor_numbers, cudaStream_t stream) {
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(3 * atom_numbers, frc_f, 0.);
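  // Grid-size idiom used throughout this commit: ceil(count / 128) blocks of
  // 128 threads cover every element; e.g. 1000 atoms give
  // ceilf(3000 / 128.f) = 24 blocks for the 3 * atom_numbers force components.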
  Reset_List<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(atom_numbers, atom_energy, 0.);
  Reset_List<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(atom_numbers, atom_lj_virial, 0.);
  VECTOR *frc = reinterpret_cast<VECTOR *>(frc_f);
  VECTOR *scaler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
  NEIGHBOR_LIST *nl_a = reinterpret_cast<NEIGHBOR_LIST *>(nl);
  construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl_a);

  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ_a = reinterpret_cast<UINT_VECTOR_LJ_TYPE *>(uint_crd_with_LJ);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ_a, LJtype, charge);

  LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy_CUDA<<<ceilf(static_cast<float>(atom_numbers) / 8), thread_LJ, 0,
                                                           stream>>>(
    atom_numbers, nl_a, uint_crd_with_LJ_a, scaler, d_LJ_A, d_LJ_B, cutoff, frc, pme_beta, TWO_DIVIDED_BY_SQRT_PI,
    atom_lj_virial, atom_energy);
  return;
}

void LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy(
  const int atom_numbers, const float cutoff, const float pme_beta, const unsigned int *uint_crd_f, const int *LJtype,
  const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, int *nl_atom_serial,
  int *nl, const float *d_LJ_A, const float *d_LJ_B, float *frc_f, float *atom_lj_virial, float *atom_energy,
  int max_neighbor_numbers, cudaStream_t stream);

backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh
@@ -0,0 +1,33 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * LJForce. This is an experimental interface that is subject to change and/or deletion.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LJ_DIRECT_CF_FORCE_WITH_LJ_VIRIAL_DIRECT_CF_ENERGY_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_LJ_DIRECT_CF_FORCE_WITH_LJ_VIRIAL_DIRECT_CF_ENERGY_IMPL_H_

#include <curand_kernel.h>
#include "runtime/device/gpu/cuda_common.h"

void LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy(
  const int atom_numbers, const float cutoff, const float pme_beta, const unsigned int *uint_crd_f, const int *LJtype,
  const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers, int *nl_atom_serial,
  int *nl, const float *d_LJ_A, const float *d_LJ_B, float *frc_f, float *atom_lj_virial, float *atom_energy,
  int max_neighbor_numbers, cudaStream_t stream);

#endif

backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cu
@@ -13,14 +13,16 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * MDIterationLeapFrog. This is an experimental interface that is subject to change and/or deletion.
 */

#include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

__global__ void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt,
                                                    const float exp_gamma, const float *inverse_mass,
                                                    const float *sqrt_mass_inverse, VECTOR *vel, VECTOR *crd,
                                                    VECTOR *frc, VECTOR *acc, VECTOR *random_frc) {
__global__ void MD_Iteration_Leap_Frog(const int atom_numbers, VECTOR *vel, VECTOR *crd, VECTOR *frc, VECTOR *acc,
                                       const float *inverse_mass, const float dt) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < atom_numbers) {
    acc[i].x = inverse_mass[i] * frc[i].x;

@@ -31,17 +33,9 @@ __global__ void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, cons
    vel[i].y = vel[i].y + dt * acc[i].y;
    vel[i].z = vel[i].z + dt * acc[i].z;

    crd[i].x = crd[i].x + half_dt * vel[i].x;
    crd[i].y = crd[i].y + half_dt * vel[i].y;
    crd[i].z = crd[i].z + half_dt * vel[i].z;

    vel[i].x = exp_gamma * vel[i].x + sqrt_mass_inverse[i] * random_frc[i].x;
    vel[i].y = exp_gamma * vel[i].y + sqrt_mass_inverse[i] * random_frc[i].y;
    vel[i].z = exp_gamma * vel[i].z + sqrt_mass_inverse[i] * random_frc[i].z;

    crd[i].x = crd[i].x + half_dt * vel[i].x;
    crd[i].y = crd[i].y + half_dt * vel[i].y;
    crd[i].z = crd[i].z + half_dt * vel[i].z;
    crd[i].x = crd[i].x + dt * vel[i].x;
    crd[i].y = crd[i].y + dt * vel[i].y;
    crd[i].z = crd[i].z + dt * vel[i].z;
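    // The half_dt / exp_gamma lines above belong to the LiuJian
    // (middle-scheme Langevin) kernel that this hunk removes; the plain
    // leap-frog kernel keeps a single full-dt position update.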

    frc[i].x = 0.;
    frc[i].y = 0.;
@@ -49,88 +43,12 @@ __global__ void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, cons
  }
}

__global__ void MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Velocity(const int atom_numbers, const float half_dt,
                                                                      const float dt, const float exp_gamma,
                                                                      const float *inverse_mass,
                                                                      const float *sqrt_mass_inverse, VECTOR *vel,
                                                                      VECTOR *crd, VECTOR *frc, VECTOR *acc,
                                                                      VECTOR *random_frc, const float max_vel) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  float abs_vel;
  if (i < atom_numbers) {
    acc[i].x = inverse_mass[i] * frc[i].x;
    acc[i].y = inverse_mass[i] * frc[i].y;
    acc[i].z = inverse_mass[i] * frc[i].z;

    vel[i].x = vel[i].x + dt * acc[i].x;
    vel[i].y = vel[i].y + dt * acc[i].y;
    vel[i].z = vel[i].z + dt * acc[i].z;

    abs_vel = norm3df(vel[i].x, vel[i].y, vel[i].z);
    if (abs_vel < max_vel) {
    } else {
      abs_vel = max_vel / abs_vel;
      vel[i].x = abs_vel * vel[i].x;
      vel[i].y = abs_vel * vel[i].y;
      vel[i].z = abs_vel * vel[i].z;
    }

    crd[i].x = crd[i].x + half_dt * vel[i].x;
    crd[i].y = crd[i].y + half_dt * vel[i].y;
    crd[i].z = crd[i].z + half_dt * vel[i].z;

    vel[i].x = exp_gamma * vel[i].x + sqrt_mass_inverse[i] * random_frc[i].x;
    vel[i].y = exp_gamma * vel[i].y + sqrt_mass_inverse[i] * random_frc[i].y;
    vel[i].z = exp_gamma * vel[i].z + sqrt_mass_inverse[i] * random_frc[i].z;

    crd[i].x = crd[i].x + half_dt * vel[i].x;
    crd[i].y = crd[i].y + half_dt * vel[i].y;
    crd[i].z = crd[i].z + half_dt * vel[i].z;

    frc[i].x = 0.;
    frc[i].y = 0.;
    frc[i].z = 0.;
  }
void MDIterationLeapFrog(const int atom_numbers, float *vel, float *crd, float *frc, float *acc,
                         const float *inverse_mass, const float dt, cudaStream_t stream) {
  VECTOR *d_vel = reinterpret_cast<VECTOR *>(vel);
  VECTOR *d_crd = reinterpret_cast<VECTOR *>(crd);
  VECTOR *d_frc = reinterpret_cast<VECTOR *>(frc);
  VECTOR *d_acc = reinterpret_cast<VECTOR *>(acc);
  MD_Iteration_Leap_Frog<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, d_vel, d_crd, d_frc, d_acc, inverse_mass, dt);
}

void MDIterationLeapFrog(const int float4_numbers, const int atom_numbers, const float half_dt, const float dt,
                         const float exp_gamma, const int is_max_velocity, const float max_velocity,
                         const float *d_mass_inverse, const float *d_sqrt_mass, float *vel_f, float *crd_f,
                         float *frc_f, float *acc_f, cudaStream_t stream) {
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(3 * atom_numbers, acc_f, 0.);

  VECTOR *frc = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(frc_f));
  VECTOR *vel = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(vel_f));
  VECTOR *acc = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(acc_f));
  VECTOR *crd = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(crd_f));

  curandStatePhilox4_32_10_t *rand_state;
  VECTOR *random_force;

  Cuda_Malloc_Safely(reinterpret_cast<void **>(&random_force), sizeof(float4) * float4_numbers);
  Cuda_Malloc_Safely(reinterpret_cast<void **>(&rand_state), sizeof(curandStatePhilox4_32_10_t) * float4_numbers);
  Setup_Rand_Normal_Kernel<<<ceilf(static_cast<float>(float4_numbers) / 32.), 32>>>(float4_numbers, rand_state, 1);
  Rand_Normal<<<ceilf(static_cast<float>(float4_numbers) / 32.), 32, 0, stream>>>(
    float4_numbers, rand_state, reinterpret_cast<float4 *>(random_force));

  if (!is_max_velocity) {
    MD_Iteration_Leap_Frog_With_LiuJian<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
      atom_numbers, half_dt, dt, exp_gamma, d_mass_inverse, d_sqrt_mass, vel, crd, frc, acc, random_force);
  } else {
    MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Velocity<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0,
                                                            stream>>>(atom_numbers, half_dt, dt, exp_gamma,
                                                                      d_mass_inverse, d_sqrt_mass, vel, crd, frc, acc,
                                                                      random_force, max_velocity);
  }

  cudaStreamSynchronize(stream);
  cudaFree(random_force);
  cudaFree(rand_state);

  return;
}

void MDIterationLeapFrog(const int float4_numbers, const int atom_numbers, const float half_dt, const float dt,
                         const float exp_gamma, const int is_max_velocity, const float max_velocity,
                         const float *d_mass_inverse, const float *d_sqrt_mass, float *vel_f, float *crd_f,
                         float *frc_f, float *acc_f, cudaStream_t stream);

backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh
@@ -13,15 +13,18 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * MDIterationLeapFrog. This is an experimental interface that is subject to change and/or deletion.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NVTIT_MD_ITERATION_LEAP_FROG_IMPL_H
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NVTIT_MD_ITERATION_LEAP_FROG_IMPL_H

#include <curand_kernel.h>
#include "runtime/device/gpu/cuda_common.h"

void MDIterationLeapFrog(const int float4_numbers, const int atom_numbers, const float half_dt, const float dt,
                         const float exp_gamma, const int is_max_velocity, const float max_velocity,
                         const float *d_mass_inverse, const float *d_sqrt_mass, float *vel_f, float *crd_f,
                         float *frc_f, float *acc_f, cudaStream_t stream);
void MDIterationLeapFrog(const int atom_numbers, float *vel, float *crd, float *frc, float *acc,
                         const float *inverse_mass, const float dt, cudaStream_t stream);

#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NVTIT_MD_ITERATION_LEAP_FROG_IMPL_H
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_

backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_common.cuh
@@ -13,6 +13,10 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * PME_Common. This is an experimental interface that is subject to change and/or deletion.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_COMMON_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_COMMON_H_
@@ -25,7 +29,7 @@ __constant__ float PME_dMa[4] = {0.5, -1.5, 1.5, -0.5};
__constant__ float PME_dMb[4] = {0, 1, -2, 1};
__constant__ float PME_dMc[4] = {0, 0.5, 0, -0.5};
#define PI 3.1415926
const float periodic_factor_inverse = 2.32830643e-10;
const float periodic_factor_inverse = 2.3283064365387e-10;
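// Both constants above are 2^-32; the replacement value carries full float
// precision. It maps a 32-bit unsigned fractional coordinate into [0, 1)
// before scaling by the FFT grid dimensions.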
static dim3 thread_PME;

const float cutoff = 10.0;

@@ -98,6 +102,10 @@ static float getb(int k, int NFFT, int B_order) {
  return res.x * res.x + res.y * res.y;
}

__global__ static void device_add(float *ene, float *factor, float *charge_sum) {
  ene[0] += factor[0] * charge_sum[0] * charge_sum[0];
}
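// device_add folds the net-charge term into an energy scalar on the device:
// pme_energy_update_impl.cu passes the neutralizing factor and the summed
// charge, so ene[0] += factor * (sum q)^2 without a host round trip.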

__global__ static void PME_Atom_Near(const UNSIGNED_INT_VECTOR *uint_crd, int *PME_atom_near, const int PME_Nin,
                                     const float periodic_factor_inverse_x, const float periodic_factor_inverse_y,
                                     const float periodic_factor_inverse_z, const int atom_numbers, const int fftx,
@@ -130,10 +138,13 @@ __global__ static void PME_Atom_Near(const UNSIGNED_INT_VECTOR *uint_crd, int *P
      UNSIGNED_INT_VECTOR temp_kxyz = PME_kxyz[k];
      kx = tempux - temp_kxyz.uint_x;
      if (kx < 0) kx += fftx;
      if (kx > fftx) kx -= fftx;
      ky = tempuy - temp_kxyz.uint_y;
      if (ky < 0) ky += ffty;
      if (ky > ffty) ky -= ffty;
      kz = tempuz - temp_kxyz.uint_z;
      if (kz < 0) kz += fftz;
      if (kz > fftz) kz -= fftz;
      temp_near[k] = kx * PME_Nin + ky * fftz + kz;
    }
  }
@@ -227,4 +238,120 @@ __global__ static void PME_Direct_Energy(const int atom_numbers, const NEIGHBOR_
  }
}

__global__ static void PME_Direct_Atom_Energy(const int atom_numbers, const NEIGHBOR_LIST *nl,
                                              const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *boxlength,
                                              const float *charge, const float beta, const float cutoff_square,
                                              float *direct_ene) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    NEIGHBOR_LIST nl_i = nl[atom_i];
    int N = nl_i.atom_numbers;
    int atom_j;
    int int_x;
    int int_y;
    int int_z;
    UNSIGNED_INT_VECTOR r1 = uint_crd[atom_i], r2;
    VECTOR dr;
    float dr2;
    float dr_abs;
    // float dr_inverse;
    float ene_temp;
    float charge_i = charge[atom_i];
    float ene_lin = 0.;

    for (int j = threadIdx.y; j < N; j = j + blockDim.y) {
      atom_j = nl_i.atom_serial[j];
      r2 = uint_crd[atom_j];

      int_x = r2.uint_x - r1.uint_x;
      int_y = r2.uint_y - r1.uint_y;
      int_z = r2.uint_z - r1.uint_z;
      dr.x = boxlength[0].x * int_x;
      dr.y = boxlength[0].y * int_y;
      dr.z = boxlength[0].z * int_z;

      dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
      if (dr2 < cutoff_square) {
        dr_abs = norm3df(dr.x, dr.y, dr.z);
        ene_temp = charge_i * charge[atom_j] * erfcf(beta * dr_abs) / dr_abs;
        ene_lin = ene_lin + ene_temp;
      }
    }
    atomicAdd(&direct_ene[atom_i], ene_lin);
  }
}

__global__ static void PME_Energy_Product(const int element_number, const float *list1, const float *list2,
                                          float *sum) {
  if (threadIdx.x == 0) {
    sum[0] = 0.;
  }
  __syncthreads();
  float lin = 0.0;
  for (int i = threadIdx.x; i < element_number; i = i + blockDim.x) {
    lin = lin + list1[i] * list2[i];
  }
  atomicAdd(sum, lin);
}
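// Note: the unguarded reset of sum[0], synchronized only by __syncthreads(),
// is correct only for single-block launches; the callers in this commit
// launch PME_Energy_Product as <<<1, 1024>>>.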

__global__ static void PME_BCFQ(cufftComplex *PME_FQ, float *PME_BC, int PME_Nfft) {
  int index = blockDim.x * blockIdx.x + threadIdx.x;
  if (index < PME_Nfft) {
    float tempf = PME_BC[index];
    cufftComplex tempc = PME_FQ[index];
    PME_FQ[index].x = tempc.x * tempf;
    PME_FQ[index].y = tempc.y * tempf;
  }
}

__global__ static void PME_Excluded_Energy_Correction(const int atom_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                                      const VECTOR *sacler, const float *charge, const float pme_beta,
                                                      const float sqrt_pi, const int *excluded_list_start,
                                                      const int *excluded_list, const int *excluded_atom_numbers,
                                                      float *ene) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    int excluded_number = excluded_atom_numbers[atom_i];
    if (excluded_number > 0) {
      int list_start = excluded_list_start[atom_i];
      // int atom_min = excluded_list[list_start];
      int list_end = list_start + excluded_number;
      int atom_j;
      int int_x;
      int int_y;
      int int_z;

      float charge_i = charge[atom_i];
      float charge_j;
      float dr_abs;
      float beta_dr;

      UNSIGNED_INT_VECTOR r1 = uint_crd[atom_i], r2;
      VECTOR dr;
      float dr2;

      float ene_lin = 0.;

      for (int i = list_start; i < list_end; i = i + 1) {
        atom_j = excluded_list[i];
        r2 = uint_crd[atom_j];
        charge_j = charge[atom_j];

        int_x = r2.uint_x - r1.uint_x;
        int_y = r2.uint_y - r1.uint_y;
        int_z = r2.uint_z - r1.uint_z;
        dr.x = sacler[0].x * int_x;
        dr.y = sacler[0].y * int_y;
        dr.z = sacler[0].z * int_z;
        dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;

        dr_abs = sqrtf(dr2);
        beta_dr = pme_beta * dr_abs;

        ene_lin -= charge_i * charge_j * erff(beta_dr) / dr_abs;
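        // Subtracts the erf (smooth) part of the Coulomb term that the
        // reciprocal-space sum implicitly added for this excluded pair.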
      }
      atomicAdd(ene, ene_lin);
    }
  }
}
#endif

backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_impl.cu
@@ -13,22 +13,14 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * PMEEnergy. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_common.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

__global__ void PME_Energy_Product(const int element_number, const float *list1, const float *list2, float *sum) {
  if (threadIdx.x == 0) {
    sum[0] = 0.;
  }
  __syncthreads();
  float lin = 0.0;
  for (int i = threadIdx.x; i < element_number; i = i + blockDim.x) {
    lin = lin + list1[i] * list2[i];
  }
  atomicAdd(sum, lin);
}

__global__ void PME_Energy_Reciprocal(const int element_number, const cufftComplex *FQ, const float *BC, float *sum) {
  if (threadIdx.x == 0) {
    sum[0] = 0.;
@@ -43,56 +35,6 @@ __global__ void PME_Energy_Reciprocal(const int element_number, const cufftCompl
  atomicAdd(sum, lin);
}

__global__ void PME_Excluded_Energy_Correction(const int atom_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                               const VECTOR *sacler, const float *charge, const float pme_beta,
                                               const float sqrt_pi, const int *excluded_list_start,
                                               const int *excluded_list, const int *excluded_atom_numbers, float *ene) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    int excluded_number = excluded_atom_numbers[atom_i];
    if (excluded_number > 0) {
      int list_start = excluded_list_start[atom_i];
      // int atom_min = excluded_list[list_start];
      int list_end = list_start + excluded_number;
      int atom_j;
      int int_x;
      int int_y;
      int int_z;

      float charge_i = charge[atom_i];
      float charge_j;
      float dr_abs;
      float beta_dr;

      UNSIGNED_INT_VECTOR r1 = uint_crd[atom_i], r2;
      VECTOR dr;
      float dr2;

      float ene_lin = 0.;

      for (int i = list_start; i < list_end; i = i + 1) {
        atom_j = excluded_list[i];
        r2 = uint_crd[atom_j];
        charge_j = charge[atom_j];

        int_x = r2.uint_x - r1.uint_x;
        int_y = r2.uint_y - r1.uint_y;
        int_z = r2.uint_z - r1.uint_z;
        dr.x = sacler[0].x * int_x;
        dr.y = sacler[0].y * int_y;
        dr.z = sacler[0].z * int_z;
        dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;

        dr_abs = sqrtf(dr2);
        beta_dr = pme_beta * dr_abs;

        ene_lin -= charge_i * charge_j * erff(beta_dr) / dr_abs;
      }
      atomicAdd(ene, ene_lin);
    }
  }
}

void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
               float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, const int *uint_crd_f,
               const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *scaler_f,

backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_update_impl.cu
@@ -0,0 +1,90 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * PMEEnergyUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_update_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_common.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

__global__ void PME_Energy_Reciprocal_update(const int element_number, const cufftComplex *FQ, const float *BC,
                                             float *sum) {
  if (threadIdx.x == 0) {
    sum[0] = 0.;
  }
  __syncthreads();
  float lin = 0.0;
  cufftComplex FQ_i;
  for (int i = threadIdx.x; i < element_number; i = i + blockDim.x) {
    FQ_i = FQ[i];
    lin = lin + (FQ_i.x * FQ_i.x + FQ_i.y * FQ_i.y) * BC[i];
  }
  atomicAdd(sum, lin);
}

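// PMEEnergyUpdate below accumulates the four Ewald/PME energy terms: the
// reciprocal term (BC-weighted norm of the FFT of the spread charge grid),
// the self term (-beta/sqrt(PI) * sum q_i^2, plus the net-charge term folded
// in via device_add), the direct term (erfc-screened pair sum inside the
// cutoff), and the excluded-pair correction.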
void PMEEnergyUpdate(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
                     float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
                     const int *uint_crd_f, const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl,
                     const float *scaler_f, const int *excluded_list_start, const int *excluded_list,
                     const int *excluded_atom_numbers, float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene,
                     float *d_correction_ene, dim3 thread_PME, int PME_Nin, int PME_Nfft, int PME_Nall,
                     const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r, float *neutralizing_factor,
                     float *charge_sum, int max_neighbor_numbers, cudaStream_t stream) {
  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
  VECTOR *scaler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
  // int max_neighbor_numbers = 800;
  NEIGHBOR_LIST *nl_a = reinterpret_cast<NEIGHBOR_LIST *>(nl);
  construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl_a);

  UNSIGNED_INT_VECTOR *PME_uxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_uxyz);
  UNSIGNED_INT_VECTOR *PME_kxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_kxyz);
  VECTOR *PME_frxyz = reinterpret_cast<VECTOR *>(pme_frxyz);
  cufftComplex *PME_FQ = reinterpret_cast<cufftComplex *>(pme_fq);

  Reset_List<<<3 * atom_numbers / 32 + 1, 32, 0, stream>>>(3 * atom_numbers, reinterpret_cast<int *>(PME_uxyz),
                                                           1 << 30);
  PME_Atom_Near<<<atom_numbers / 32 + 1, 32, 0, stream>>>(
    uint_crd, PME_atom_near, PME_Nin, periodic_factor_inverse * fftx, periodic_factor_inverse * ffty,
    periodic_factor_inverse * fftz, atom_numbers, fftx, ffty, fftz, PME_kxyz, PME_uxyz, PME_frxyz);

  Reset_List<<<PME_Nall / 1024 + 1, 1024, 0, stream>>>(PME_Nall, PME_Q, 0);

  PME_Q_Spread<<<atom_numbers / thread_PME.x + 1, thread_PME, 0, stream>>>(PME_atom_near, charge, PME_frxyz, PME_Q,
                                                                           PME_kxyz, atom_numbers);

  cufftExecR2C(PME_plan_r2c, reinterpret_cast<float *>(PME_Q), reinterpret_cast<cufftComplex *>(PME_FQ));

  PME_Energy_Reciprocal_update<<<1, 1024, 0, stream>>>(PME_Nfft, PME_FQ, PME_BC, d_reciprocal_ene);

  PME_Energy_Product<<<1, 1024, 0, stream>>>(atom_numbers, charge, charge, d_self_ene);
  Scale_List<<<1, 1, 0, stream>>>(1, d_self_ene, -beta / sqrtf(PI));

  Sum_Of_List<<<1, 1024>>>(atom_numbers, charge, charge_sum);
  device_add<<<1, 1>>>(d_self_ene, neutralizing_factor, charge_sum);
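  // Note: Sum_Of_List and device_add above are launched without a stream
  // argument and therefore run on the default stream, unlike the surrounding
  // kernels, which are all queued on `stream`.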

  Reset_List<<<1, 1, 0, stream>>>(1, d_direct_ene, 0.0);
  PME_Direct_Energy<<<atom_numbers / thread_PME.x + 1, thread_PME, 0, stream>>>(
    atom_numbers, nl_a, uint_crd, scaler, charge, beta, cutoff * cutoff, d_direct_ene);

  Reset_List<<<1, 1, 0, stream>>>(1, d_correction_ene, 0.0);
  PME_Excluded_Energy_Correction<<<atom_numbers / 32 + 1, 32, 0, stream>>>(
    atom_numbers, uint_crd, scaler, charge, beta, sqrtf(PI), excluded_list_start, excluded_list, excluded_atom_numbers,
    d_correction_ene);
  return;
}

backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_update_impl.cuh
@@ -0,0 +1,31 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_UPDATE_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_UPDATE_IMPL_H_

#include <cufft.h>
#include "runtime/device/gpu/cuda_common.h"

void PMEEnergyUpdate(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
                     float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
                     const int *uint_crd_f, const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl,
                     const float *scaler_f, const int *excluded_list_start, const int *excluded_list,
                     const int *excluded_atom_numbers, float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene,
                     float *d_correction_ene, dim3 thread_PME, int PME_Nin, int PME_Nfft, int PME_Nall,
                     const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r, float *neutralizing_factor,
                     float *charge_sum, int max_neighbor_numbers, cudaStream_t stream);

#endif

backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cu
@@ -13,19 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * PMEReciprocalForce. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_common.cuh"

__global__ void PME_BCFQ(cufftComplex *PME_FQ, float *PME_BC, int PME_Nfft) {
  int index = blockDim.x * blockIdx.x + threadIdx.x;
  if (index < PME_Nfft) {
    float tempf = PME_BC[index];
    cufftComplex tempc = PME_FQ[index];
    PME_FQ[index].x = tempc.x * tempf;
    PME_FQ[index].y = tempc.y * tempf;
  }
}

__global__ void PME_Final(int *PME_atom_near, const float *charge, const float *PME_Q, VECTOR *force,
                          const VECTOR *PME_frxyz, const UNSIGNED_INT_VECTOR *PME_kxyz,
                          const _VECTOR PME_inverse_box_vector, const int atom_numbers) {
@@ -92,9 +86,11 @@ void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float be
  // initial end
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(
    3 * atom_numbers, reinterpret_cast<float *>(frc), 0.);

  PME_Atom_Near<<<atom_numbers / 32 + 1, 32, 0, stream>>>(
    uint_crd, PME_atom_near, PME_Nin, periodic_factor_inverse * fftx, periodic_factor_inverse * ffty,
    periodic_factor_inverse * fftz, atom_numbers, fftx, ffty, fftz, PME_kxyz, PME_uxyz, PME_frxyz);

  Reset_List<<<PME_Nall / 1024 + 1, 1024, 0, stream>>>(PME_Nall, PME_Q, 0);

  PME_Q_Spread<<<atom_numbers / thread_PME.x + 1, thread_PME, 0, stream>>>(PME_atom_near, charge, PME_frxyz, PME_Q,

backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cu
@@ -13,9 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * Constrain_Force_Cycle. This is an experimental interface that is subject to change and/or deletion.
 */

#include "backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh"

__global__ void Constrain_Force_Cycle_Kernel(const int constrain_pair_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                             const VECTOR *scaler, const CONSTRAIN_PAIR *constrain_pair,
@@ -64,10 +68,10 @@ void Constrain_Force_Cycle(int atom_numbers, int constrain_pair_numbers, const u
  CONSTRAIN_PAIR *constrain_pair = reinterpret_cast<CONSTRAIN_PAIR *>(constrain_pair_f);

  construct_constrain_pair<<<ceilf(static_cast<float>(constrain_pair_numbers) / 128), 128, 0, stream>>>(
    constrain_pair_numbers, atom_i_serials, atom_j_serials, constant_rs, constrain_ks, constrain_pair);

  Constrain_Force_Cycle_Kernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    constrain_pair_numbers, uint_crd, scaler, constrain_pair, pair_dr, test_frc);

  return;
}

backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_cycle_impl.cuh
@@ -13,6 +13,10 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * Constrain_Force_Cycle. This is an experimental interface that is subject to change and/or deletion.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_IMPL_H_
@@ -24,4 +28,5 @@ void Constrain_Force_Cycle(int atom_numbers, int constrain_pair_numbers, const u
                           const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f,
                           const int *atom_i_serials, const int *atom_j_serials, const float *constant_rs,
                           const float *constrain_ks, float *test_frc_f, cudaStream_t stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_WITH_VIRIAL_IMPL_H_

#endif

backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cu
@@ -13,9 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * ConstrainForceCycleVirial. This is an experimental interface that is subject to change and/or deletion.
 */

#include "backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh"

__global__ void Constrain_Force_Cycle_With_Virial(int constrain_pair_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                                  const VECTOR *scaler, CONSTRAIN_PAIR *constrain_pair,
@@ -46,7 +50,8 @@ void Constrain_Force_Cycle_With_Virial(int atom_numbers, int constrain_pair_numb
                                       const float *constrain_ks, float *test_frc_f, float *d_atom_virial,
                                       cudaStream_t stream) {
  Reset_List<<<ceilf(static_cast<float>(3 * atom_numbers) / 128), 128, 0, stream>>>(3 * atom_numbers, test_frc_f, 0.);
  Reset_List<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(atom_numbers, d_atom_virial, 0.);
  Reset_List<<<ceilf(static_cast<float>(constrain_pair_numbers) / 128), 128, 0, stream>>>(constrain_pair_numbers,
                                                                                          d_atom_virial, 0.);
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  const UNSIGNED_INT_VECTOR *uint_crd = reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f);
@@ -58,10 +63,10 @@ void Constrain_Force_Cycle_With_Virial(int atom_numbers, int constrain_pair_numb
  CONSTRAIN_PAIR *constrain_pair = reinterpret_cast<CONSTRAIN_PAIR *>(constrain_pair_f);

  construct_constrain_pair<<<ceilf(static_cast<float>(constrain_pair_numbers) / 128), 128, 0, stream>>>(
    constrain_pair_numbers, atom_i_serials, atom_j_serials, constant_rs, constrain_ks, constrain_pair);

  Constrain_Force_Cycle_With_Virial<<<block_per_grid, thread_per_block, 0, stream>>>(
    constrain_pair_numbers, uint_crd, scaler, constrain_pair, pair_dr, test_frc, d_atom_virial);

  return;
}

backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_cycle_with_virial_impl.cuh
@@ -13,6 +13,10 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * ConstrainForceCycleVirial. This is an experimental interface that is subject to change and/or deletion.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_WITH_VIRIAL_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_WITH_VIRIAL_IMPL_H_
@@ -25,4 +29,4 @@ void Constrain_Force_Cycle_With_Virial(int atom_numbers, int constrain_pair_numb
                                       const int *atom_i_serials, const int *atom_j_serials, const float *constant_rs,
                                       const float *constrain_ks, float *test_frc_f, float *d_atom_virial,
                                       cudaStream_t stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_CYCLE_WITH_VIRIAL_IMPL_H_
#endif

backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cu
@@ -0,0 +1,180 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * ConstrainForceCycleVirial. This is an experimental interface that is subject to change and/or deletion.
 */

#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh"

__global__ void constrain_force_cycle_with_virial_update_kernel(int constrain_pair_numbers,
                                                                const UNSIGNED_INT_VECTOR *uint_crd,
                                                                const VECTOR *scaler, CONSTRAIN_PAIR *constrain_pair,
                                                                const VECTOR *pair_dr, VECTOR *test_frc,
                                                                float *d_atom_virial) {
  int pair_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (pair_i < constrain_pair_numbers) {
    CONSTRAIN_PAIR cp = constrain_pair[pair_i];
    VECTOR dr0 = pair_dr[pair_i];
    VECTOR dr = Get_Periodic_Displacement(uint_crd[cp.atom_i_serial], uint_crd[cp.atom_j_serial], scaler[0]);
    float r_1 = rnorm3df(dr.x, dr.y, dr.z);
    float frc_abs = (1. - cp.constant_r * r_1) * cp.constrain_k;
    VECTOR frc_lin = frc_abs * dr0;
    d_atom_virial[pair_i] -= frc_lin * dr0;
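    // Per-pair virial: the dot product of the constraint force with the
    // stored pair displacement (the float * VECTOR and VECTOR * VECTOR
    // overloads are presumably supplied by common_sponge.cuh).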

    atomicAdd(&test_frc[cp.atom_j_serial].x, frc_lin.x);
    atomicAdd(&test_frc[cp.atom_j_serial].y, frc_lin.y);
    atomicAdd(&test_frc[cp.atom_j_serial].z, frc_lin.z);

    atomicAdd(&test_frc[cp.atom_i_serial].x, -frc_lin.x);
    atomicAdd(&test_frc[cp.atom_i_serial].y, -frc_lin.y);
    atomicAdd(&test_frc[cp.atom_i_serial].z, -frc_lin.z);
  }
}

__global__ void constrain_force_cycle_update_kernel(const int constrain_pair_numbers,
                                                    const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *scaler,
                                                    const CONSTRAIN_PAIR *constrain_pair, const VECTOR *pair_dr,
                                                    VECTOR *test_frc) {
  int pair_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (pair_i < constrain_pair_numbers) {
    CONSTRAIN_PAIR cp = constrain_pair[pair_i];
    float r_1;
    VECTOR dr;
    float frc_abs;
    VECTOR frc_lin;

    dr.x = (static_cast<int>(uint_crd[cp.atom_i_serial].uint_x - uint_crd[cp.atom_j_serial].uint_x)) * scaler[0].x;
    dr.y = (static_cast<int>(uint_crd[cp.atom_i_serial].uint_y - uint_crd[cp.atom_j_serial].uint_y)) * scaler[0].y;
    dr.z = (static_cast<int>(uint_crd[cp.atom_i_serial].uint_z - uint_crd[cp.atom_j_serial].uint_z)) * scaler[0].z;
    r_1 = rnorm3df(dr.x, dr.y, dr.z);
    frc_abs = (1. - cp.constant_r * r_1) * cp.constrain_k;
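    // Restoring force magnitude k * (1 - r0 / r), applied below along the
    // pair displacement pair_dr captured before the position update.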

    frc_lin.x = frc_abs * pair_dr[pair_i].x;
    frc_lin.y = frc_abs * pair_dr[pair_i].y;
    frc_lin.z = frc_abs * pair_dr[pair_i].z;

    atomicAdd(&test_frc[cp.atom_j_serial].x, frc_lin.x);
    atomicAdd(&test_frc[cp.atom_j_serial].y, frc_lin.y);
    atomicAdd(&test_frc[cp.atom_j_serial].z, frc_lin.z);

    atomicAdd(&test_frc[cp.atom_i_serial].x, -frc_lin.x);
    atomicAdd(&test_frc[cp.atom_i_serial].y, -frc_lin.y);
    atomicAdd(&test_frc[cp.atom_i_serial].z, -frc_lin.z);
  }
}

__global__ void refresh_uint_crd_update_kernel(int atom_numbers, const VECTOR *crd,
                                               const VECTOR *quarter_crd_to_uint_crd_cof,
                                               UNSIGNED_INT_VECTOR *uint_crd, VECTOR *test_frc,
                                               const float *mass_inverse, const float half_exp_gamma_plus_half) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    INT_VECTOR tempi;
    VECTOR crd_lin = crd[atom_i];
    VECTOR frc_lin = test_frc[atom_i];
    float mass_lin = mass_inverse[atom_i];

    crd_lin.x = crd_lin.x + half_exp_gamma_plus_half * frc_lin.x * mass_lin;
    crd_lin.y = crd_lin.y + half_exp_gamma_plus_half * frc_lin.y * mass_lin;
    crd_lin.z = crd_lin.z + half_exp_gamma_plus_half * frc_lin.z * mass_lin;

    tempi.int_x = crd_lin.x * quarter_crd_to_uint_crd_cof[0].x;
    tempi.int_y = crd_lin.y * quarter_crd_to_uint_crd_cof[0].y;
    tempi.int_z = crd_lin.z * quarter_crd_to_uint_crd_cof[0].z;

    uint_crd[atom_i].uint_x = tempi.int_x << 2;
    uint_crd[atom_i].uint_y = tempi.int_y << 2;
    uint_crd[atom_i].uint_z = tempi.int_z << 2;
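    // Truncating to int and shifting left by 2 keeps each unsigned coordinate
    // a multiple of 4; quarter_crd_to_uint_crd_cof appears to be the
    // crd-to-uint scale divided by 4, so the product stays in range before
    // the shift.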
  }
}

void constrain_force_cycle_update(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f,
                                  const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f,
                                  const int *atom_i_serials, const int *atom_j_serials, const float *constant_rs,
                                  const float *constrain_ks, float *test_frc_f, cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  const UNSIGNED_INT_VECTOR *uint_crd = reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f);
  const VECTOR *scaler = reinterpret_cast<const VECTOR *>(scaler_f);
  const VECTOR *pair_dr = reinterpret_cast<const VECTOR *>(pair_dr_f);

  VECTOR *test_frc = reinterpret_cast<VECTOR *>(test_frc_f);

  CONSTRAIN_PAIR *constrain_pair = reinterpret_cast<CONSTRAIN_PAIR *>(constrain_pair_f);

  construct_constrain_pair<<<ceilf(static_cast<float>(constrain_pair_numbers) / 128), 128, 0, stream>>>(
    constrain_pair_numbers, atom_i_serials, atom_j_serials, constant_rs, constrain_ks, constrain_pair);

  constrain_force_cycle_update_kernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    constrain_pair_numbers, uint_crd, scaler, constrain_pair, pair_dr, test_frc);

  return;
}

void constrain_force_cycle_with_virial_update(int atom_numbers, int constrain_pair_numbers,
                                              const unsigned int *uint_crd_f, const float *scaler_f,
                                              float *constrain_pair_f, const float *pair_dr_f,
                                              const int *atom_i_serials, const int *atom_j_serials,
                                              const float *constant_rs, const float *constrain_ks, float *test_frc_f,
                                              float *d_atom_virial, cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  const UNSIGNED_INT_VECTOR *uint_crd = reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f);
  const VECTOR *scaler = reinterpret_cast<const VECTOR *>(scaler_f);
  const VECTOR *pair_dr = reinterpret_cast<const VECTOR *>(pair_dr_f);

  VECTOR *test_frc = reinterpret_cast<VECTOR *>(test_frc_f);

  CONSTRAIN_PAIR *constrain_pair = reinterpret_cast<CONSTRAIN_PAIR *>(constrain_pair_f);

  construct_constrain_pair<<<ceilf(static_cast<float>(constrain_pair_numbers) / 128), 128, 0, stream>>>(
    constrain_pair_numbers, atom_i_serials, atom_j_serials, constant_rs, constrain_ks, constrain_pair);

  constrain_force_cycle_with_virial_update_kernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    constrain_pair_numbers, uint_crd, scaler, constrain_pair, pair_dr, test_frc, d_atom_virial);

  return;
}

void refresh_uint_crd_update(int atom_numbers, float half_exp_gamma_plus_half, const float *crd_f,
                             const float *quarter_crd_to_uint_crd_cof_f, float *test_frc_f, const float *mass_inverse,
                             unsigned int *uint_crd_f, cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  const VECTOR *crd = reinterpret_cast<const VECTOR *>(crd_f);
  const VECTOR *quarter_crd_to_uint_crd_cof = reinterpret_cast<const VECTOR *>(quarter_crd_to_uint_crd_cof_f);
  VECTOR *test_frc = reinterpret_cast<VECTOR *>(test_frc_f);
  UNSIGNED_INT_VECTOR *uint_crd = reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd_f);

  refresh_uint_crd_update_kernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    atom_numbers, crd, quarter_crd_to_uint_crd_cof, uint_crd, test_frc, mass_inverse, half_exp_gamma_plus_half);
  return;
}

void set_zero_force_with_virial(int atom_numbers, int constrain_pair_numbers, float *test_frc_f, float *d_atom_virial,
                                cudaStream_t stream) {
  Reset_List<<<ceilf(static_cast<float>(3 * atom_numbers) / 128), 128, 0, stream>>>(3 * atom_numbers, test_frc_f, 0.);
  Reset_List<<<ceilf(static_cast<float>(constrain_pair_numbers) / 128), 128, 0, stream>>>(constrain_pair_numbers,
                                                                                          d_atom_virial, 0.);
  return;
}

void set_zero(int numbers, float *x, cudaStream_t stream) {
  Reset_List<<<ceilf(static_cast<float>(numbers) / 128), 128, 0, stream>>>(numbers, x, 0.);
  return;
}

backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh
@@ -0,0 +1,48 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * ConstrainForce. This is an experimental interface that is subject to change and/or deletion.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_VIRIAL_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_VIRIAL_IMPL_H_

#include <curand_kernel.h>
#include "runtime/device/gpu/cuda_common.h"

void constrain_force_cycle_update(int atom_numbers, int constrain_pair_numbers, const unsigned int *uint_crd_f,
                                  const float *scaler_f, float *constrain_pair_f, const float *pair_dr_f,
                                  const int *atom_i_serials, const int *atom_j_serials, const float *constant_rs,
                                  const float *constrain_ks, float *test_frc_f, cudaStream_t stream);

void constrain_force_cycle_with_virial_update(int atom_numbers, int constrain_pair_numbers,
                                              const unsigned int *uint_crd_f, const float *scaler_f,
                                              float *constrain_pair_f, const float *pair_dr_f,
                                              const int *atom_i_serials, const int *atom_j_serials,
                                              const float *constant_rs, const float *constrain_ks, float *test_frc_f,
                                              float *d_atom_virial, cudaStream_t stream);

void refresh_uint_crd_update(int atom_numbers, float half_exp_gamma_plus_half, const float *crd_f,
                             const float *quarter_crd_to_uint_crd_cof_f, float *test_frc_f, const float *mass_inverse,
                             unsigned int *uint_crd_f, cudaStream_t stream);

void set_zero_force_with_virial(int atom_numbers, int constrain_pair_numbers, float *test_frc_f, float *d_atom_virial,
                                cudaStream_t stream);

void set_zero(int numbers, float *x, cudaStream_t stream);

#endif

backend/kernel_compiler/gpu/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.cc
@@ -0,0 +1,39 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Note:
 * LJForceWithVirialEnergy. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_THREE(LJForceWithVirialEnergy,
                        KernelAttr()
                          .AddInputAttr(kNumberTypeUInt32)
                          .AddInputAttr(kNumberTypeInt32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddInputAttr(kNumberTypeInt32)
                          .AddInputAttr(kNumberTypeInt32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddOutputAttr(kNumberTypeFloat32)
                          .AddOutputAttr(kNumberTypeFloat32)
                          .AddOutputAttr(kNumberTypeFloat32),
                        LJForceWithVirialEnergyGpuKernel, float, int, unsigned int)
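// The three trailing template arguments bind T = float, T1 = int and
// T2 = unsigned int in LJForceWithVirialEnergyGpuKernel, matching the
// UInt32/Int32/Float32 attribute order declared above.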
}  // namespace kernel
}  // namespace mindspore

backend/kernel_compiler/gpu/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_kernel.h
@@ -0,0 +1,142 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
/**
|
||||
*Note:
|
||||
* LJForceWithVirialEnergy. This is an experimental interface that is subject to change and/or deletion.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LJ_DIRECT_CF_FORCE_WITH_LJ_VIRIAL_DIRECT_CF_ENERGY_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LJ_DIRECT_CF_FORCE_WITH_LJ_VIRIAL_DIRECT_CF_ENERGY_KERNEL_H_
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
#include "runtime/device/gpu/cuda_common.h"
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T, typename T1, typename T2>
|
||||
class LJForceWithVirialEnergyGpuKernel : public GpuKernel {
|
||||
public:
|
||||
LJForceWithVirialEnergyGpuKernel() : ele_uint_crd(1) {}
|
||||
~LJForceWithVirialEnergyGpuKernel() override = default;
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_node_ = kernel_node;
|
||||
atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
|
||||
max_neighbor_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "max_neighbor_numbers"));
|
||||
cutoff = static_cast<float>(GetAttr<float_t>(kernel_node, "cutoff"));
|
||||
pme_beta = static_cast<float>(GetAttr<float_t>(kernel_node, "pme_beta"));
|
||||
|
||||
auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
|
||||
auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
|
||||
auto shape_nl_numbers = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
|
||||
auto shape_nl_serial = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
|
||||
auto shape_d_LJ_a = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
|
||||
auto shape_d_LJ_b = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);
|
||||
|
||||
for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
|
||||
for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
|
||||
for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
|
||||
for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i];
|
||||
for (size_t i = 0; i < shape_d_LJ_a.size(); i++) ele_d_LJ_a *= shape_d_LJ_a[i];
|
||||
for (size_t i = 0; i < shape_d_LJ_b.size(); i++) ele_d_LJ_b *= shape_d_LJ_b[i];
|
||||
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
auto uint_crd = GetDeviceAddress<T2>(inputs, 0);
|
||||
auto LJtype = GetDeviceAddress<T1>(inputs, 1);
|
||||
auto charge = GetDeviceAddress<T>(inputs, 2);
|
||||
auto scaler = GetDeviceAddress<T>(inputs, 3);
|
||||
auto nl_numbers = GetDeviceAddress<T1>(inputs, 4);
|
||||
auto nl_serial = GetDeviceAddress<T1>(inputs, 5);
|
||||
auto d_LJ_a = GetDeviceAddress<T>(inputs, 6);
|
||||
auto d_LJ_b = GetDeviceAddress<T>(inputs, 7);
|
||||
|
||||
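    // Workspace: coordinates fused with LJ type and charge (UINT_VECTOR_LJ_TYPE), plus the
    // flat neighbor list repacked into NEIGHBOR_LIST structs for the kernel.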
    auto uint_crd_with_LJ = GetDeviceAddress<T>(workspace, 0);
    auto nl = GetDeviceAddress<T1>(workspace, 1);

    auto frc = GetDeviceAddress<T>(outputs, 0);
    auto atom_lj_virial = GetDeviceAddress<T>(outputs, 1);
    auto atom_energy = GetDeviceAddress<T>(outputs, 2);
    LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy(atom_numbers, cutoff, pme_beta, uint_crd, LJtype, charge, scaler,
                                                       uint_crd_with_LJ, nl_numbers, nl_serial, nl, d_LJ_a, d_LJ_b, frc,
                                                       atom_lj_virial, atom_energy, max_neighbor_numbers,
                                                       reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T2));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_scaler * sizeof(T));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(max_neighbor_numbers * sizeof(T1));
    input_size_list_.push_back(ele_d_LJ_a * sizeof(T));
    input_size_list_.push_back(ele_d_LJ_b * sizeof(T));

    workspace_size_list_.push_back(atom_numbers * max_neighbor_numbers * sizeof(T1));
    workspace_size_list_.push_back(atom_numbers * sizeof(UINT_VECTOR_LJ_TYPE));

    output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
    output_size_list_.push_back(atom_numbers * sizeof(T));
    output_size_list_.push_back(atom_numbers * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_scaler = 1;
  size_t ele_nl = 1;
  size_t ele_d_LJ_a = 1;
  size_t ele_d_LJ_b = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  int max_neighbor_numbers;
  float pme_beta;
  float cutoff;
  struct UINT_VECTOR_LJ_TYPE {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
    int LJ_type;
    float charge;
  };
  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;
  };
};
}  // namespace kernel
}  // namespace mindspore
#endif
@ -0,0 +1,38 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * LJForceWithPMEDirectForceUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/lj/lj_force_with_pme_direct_force_update_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(LJForceWithPMEDirectForceUpdate,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      LJForceWithPMEDirectForceUpdateGpuKernel, float, int)
}  // namespace kernel
}  // namespace mindspore
@ -0,0 +1,151 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * LJForceWithPMEDirectForceUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_LJ_LJ_FORCE_WITH_PME_DIRECT_FORCE_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_LJ_LJ_FORCE_WITH_PME_DIRECT_FORCE_KERNEL_H_
#include <cuda_runtime_api.h>
#include <vector>
#include <string>
#include <map>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class LJForceWithPMEDirectForceUpdateGpuKernel : public GpuKernel {
 public:
  LJForceWithPMEDirectForceUpdateGpuKernel() : ele_uint_crd(1) {}
  ~LJForceWithPMEDirectForceUpdateGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    cutoff = static_cast<float>(GetAttr<float_t>(kernel_node, "cutoff"));
    pme_beta = static_cast<float>(GetAttr<float_t>(kernel_node, "pme_beta"));
    need_update = static_cast<int>(GetAttr<int64_t>(kernel_node, "need_update"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_nl_numbers = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_nl_serial = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_d_LJ_a = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_d_LJ_b = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i];
    for (size_t i = 0; i < shape_d_LJ_a.size(); i++) ele_d_LJ_a *= shape_d_LJ_a[i];
    for (size_t i = 0; i < shape_d_LJ_b.size(); i++) ele_d_LJ_b *= shape_d_LJ_b[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd = GetDeviceAddress<T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<T1>(inputs, 1);
    auto charge = GetDeviceAddress<T>(inputs, 2);
    auto scaler = GetDeviceAddress<T>(inputs, 3);
    auto nl_numbers = GetDeviceAddress<T1>(inputs, 4);
    auto nl_serial = GetDeviceAddress<T1>(inputs, 5);
    auto d_LJ_a = GetDeviceAddress<T>(inputs, 6);
    auto d_LJ_b = GetDeviceAddress<T>(inputs, 7);
    auto d_beta = GetDeviceAddress<T>(inputs, 8);

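    // need_update: pme_beta may have been recomputed on device by an earlier step, so pull the
    // fresh value back to the host (synchronized on this kernel's stream) before launch.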
    if (need_update) {
      cudaMemcpyAsync(&pme_beta, d_beta, sizeof(float), cudaMemcpyDeviceToHost,
                      reinterpret_cast<cudaStream_t>(stream_ptr));
      cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr));
    }

    auto uint_crd_with_LJ = GetDeviceAddress<T>(workspace, 0);
    auto nl = GetDeviceAddress<T1>(workspace, 1);

    auto frc = GetDeviceAddress<T>(outputs, 0);
    LJForceWithPMEDirectForce(atom_numbers, cutoff, pme_beta, uint_crd, LJtype, charge, scaler, uint_crd_with_LJ,
                              nl_numbers, nl_serial, nl, d_LJ_a, d_LJ_b, frc,
                              reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_scaler * sizeof(T));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(max_nl_numbers * sizeof(T1));
    input_size_list_.push_back(ele_d_LJ_a * sizeof(T));
    input_size_list_.push_back(ele_d_LJ_b * sizeof(T));
    input_size_list_.push_back(sizeof(T));

    workspace_size_list_.push_back(atom_numbers * max_nl_numbers * sizeof(T1));
    workspace_size_list_.push_back(atom_numbers * sizeof(UINT_VECTOR_LJ_TYPE));

    output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_scaler = 1;
  size_t ele_nl = 1;
  size_t ele_d_LJ_a = 1;
  size_t ele_d_LJ_b = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  float pme_beta;
  float cutoff;
  int need_update;
  int max_nl_numbers = 800;
  struct UINT_VECTOR_LJ_TYPE {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
    int LJ_type;
    float charge;
  };
  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;
  };
  struct VECTOR {
    float x;
    float y;
    float z;
  };
};
}  // namespace kernel
}  // namespace mindspore
#endif
@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * LJForceWithVirialEnergyUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/lj/lj_force_with_virial_energy_update_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_THREE(LJForceWithVirialEnergyUpdate,
                        KernelAttr()
                          .AddInputAttr(kNumberTypeUInt32)
                          .AddInputAttr(kNumberTypeInt32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddInputAttr(kNumberTypeInt32)
                          .AddInputAttr(kNumberTypeInt32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddOutputAttr(kNumberTypeFloat32)
                          .AddOutputAttr(kNumberTypeFloat32)
                          .AddOutputAttr(kNumberTypeFloat32),
                        LJForceWithVirialEnergyUpdateGpuKernel, float, int, unsigned int)
}  // namespace kernel
}  // namespace mindspore
@ -0,0 +1,152 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * LJForceWithVirialEnergyUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LJ_FORCE_WITH_VIRIAL_ENERGY_UPDATE_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_LJ_FORCE_WITH_VIRIAL_ENERGY_UPDATE_KERNEL_H_
#include <cuda_runtime_api.h>
#include <vector>
#include <string>
#include <map>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_direct_cf_force_with_lj_virial_direct_cf_energy_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename T1, typename T2>
class LJForceWithVirialEnergyUpdateGpuKernel : public GpuKernel {
 public:
  LJForceWithVirialEnergyUpdateGpuKernel() : ele_uint_crd(1) {}
  ~LJForceWithVirialEnergyUpdateGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    max_neighbor_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "max_neighbor_numbers"));
    cutoff = static_cast<float>(GetAttr<float_t>(kernel_node, "cutoff"));
    pme_beta = static_cast<float>(GetAttr<float_t>(kernel_node, "pme_beta"));
    need_update = static_cast<int>(GetAttr<int64_t>(kernel_node, "need_update"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_nl_numbers = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_nl_serial = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_d_LJ_a = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_d_LJ_b = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i];
    for (size_t i = 0; i < shape_d_LJ_a.size(); i++) ele_d_LJ_a *= shape_d_LJ_a[i];
    for (size_t i = 0; i < shape_d_LJ_b.size(); i++) ele_d_LJ_b *= shape_d_LJ_b[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd = GetDeviceAddress<T2>(inputs, 0);
    auto LJtype = GetDeviceAddress<T1>(inputs, 1);
    auto charge = GetDeviceAddress<T>(inputs, 2);
    auto scaler = GetDeviceAddress<T>(inputs, 3);
    auto nl_numbers = GetDeviceAddress<T1>(inputs, 4);
    auto nl_serial = GetDeviceAddress<T1>(inputs, 5);
    auto d_LJ_a = GetDeviceAddress<T>(inputs, 6);
    auto d_LJ_b = GetDeviceAddress<T>(inputs, 7);
    auto d_beta = GetDeviceAddress<T>(inputs, 8);

    auto uint_crd_with_LJ = GetDeviceAddress<T>(workspace, 0);
    auto nl = GetDeviceAddress<T1>(workspace, 1);

    auto frc = GetDeviceAddress<T>(outputs, 0);
    auto atom_lj_virial = GetDeviceAddress<T>(outputs, 1);
    auto atom_energy = GetDeviceAddress<T>(outputs, 2);

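    // Same pattern as LJForceWithPMEDirectForceUpdate: refresh pme_beta from d_beta when requested.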
    if (need_update) {
      cudaMemcpyAsync(&pme_beta, d_beta, sizeof(float), cudaMemcpyDeviceToHost,
                      reinterpret_cast<cudaStream_t>(stream_ptr));
      cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr));
    }

    LJ_Direct_CF_Force_With_LJ_Virial_Direct_CF_Energy(atom_numbers, cutoff, pme_beta, uint_crd, LJtype, charge, scaler,
                                                       uint_crd_with_LJ, nl_numbers, nl_serial, nl, d_LJ_a, d_LJ_b, frc,
                                                       atom_lj_virial, atom_energy, max_neighbor_numbers,
                                                       reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T2));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_scaler * sizeof(T));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(max_neighbor_numbers * sizeof(T1));
    input_size_list_.push_back(ele_d_LJ_a * sizeof(T));
    input_size_list_.push_back(ele_d_LJ_b * sizeof(T));
    input_size_list_.push_back(sizeof(T));  // d_beta, the ninth input

    workspace_size_list_.push_back(atom_numbers * max_neighbor_numbers * sizeof(T1));
    workspace_size_list_.push_back(atom_numbers * sizeof(UINT_VECTOR_LJ_TYPE));

    output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
    output_size_list_.push_back(atom_numbers * sizeof(T));
    output_size_list_.push_back(atom_numbers * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_scaler = 1;
  size_t ele_nl = 1;
  size_t ele_d_LJ_a = 1;
  size_t ele_d_LJ_b = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  int max_neighbor_numbers;
  float pme_beta;
  float cutoff;
  int need_update;
  struct UINT_VECTOR_LJ_TYPE {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
    int LJ_type;
    float charge;
  };
  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;
  };
};
}  // namespace kernel
}  // namespace mindspore
#endif
@ -13,14 +13,19 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * MDIterationLeapFrog. This is an experimental interface that is subject to change and/or deletion.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NVTIT_MD_ITERATION_LEAP_FROG_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NVTIT_MD_ITERATION_LEAP_FROG_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_LEAP_FROG_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_LEAP_FROG_KERNEL_H_

#include <cuda_runtime_api.h>
#include <vector>
#include <string>
#include <map>
#include <string>
#include <vector>

#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
@ -31,25 +36,13 @@ namespace kernel {
template <typename T>
class MDIterationLeapFrogGpuKernel : public GpuKernel {
 public:
  MDIterationLeapFrogGpuKernel() : ele_mass_inverse(1) {}
  MDIterationLeapFrogGpuKernel() {}
  ~MDIterationLeapFrogGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    float4_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "float4_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    half_dt = static_cast<float>(GetAttr<float>(kernel_node, "half_dt"));
    dt = static_cast<float>(GetAttr<float>(kernel_node, "dt"));
    exp_gamma = static_cast<float>(GetAttr<float>(kernel_node, "exp_gamma"));
    is_max_velocity = static_cast<int>(GetAttr<int64_t>(kernel_node, "is_max_velocity"));
    max_velocity = static_cast<float>(GetAttr<float>(kernel_node, "max_velocity"));

    auto shape_mass_inverse = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_qrt_mass = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);

    for (size_t i = 0; i < shape_mass_inverse.size(); i++) ele_mass_inverse *= shape_mass_inverse[i];
    for (size_t i = 0; i < shape_qrt_mass.size(); i++) ele_sqrt_mass *= shape_qrt_mass[i];

    InitSizeLists();
    return true;
  }
@ -58,49 +51,36 @@ class MDIterationLeapFrogGpuKernel : public GpuKernel {
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto d_mass_inverse = GetDeviceAddress<const T>(inputs, 0);
    auto d_sqrt_mass = GetDeviceAddress<const T>(inputs, 1);

    auto vel_f = GetDeviceAddress<T>(outputs, 0);
    auto crd_f = GetDeviceAddress<T>(outputs, 1);
    auto frc_f = GetDeviceAddress<T>(outputs, 2);
    auto acc_f = GetDeviceAddress<T>(outputs, 3);

    MDIterationLeapFrog(float4_numbers, atom_numbers, half_dt, dt, exp_gamma, is_max_velocity, max_velocity,
                        d_mass_inverse, d_sqrt_mass, vel_f, crd_f, frc_f, acc_f,
                        reinterpret_cast<cudaStream_t>(stream_ptr));
    auto vel = GetDeviceAddress<float>(inputs, 0);
    auto crd = GetDeviceAddress<float>(inputs, 1);
    auto frc = GetDeviceAddress<float>(inputs, 2);
    auto acc = GetDeviceAddress<float>(inputs, 3);
    auto inverse_mass = GetDeviceAddress<float>(inputs, 4);

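    // New-style leap-frog step; the impl (not shown in this diff) is expected to integrate
    // per atom roughly as: acc = inverse_mass * frc; vel += dt * acc; crd += dt * vel.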
    MDIterationLeapFrog(atom_numbers, vel, crd, frc, acc, inverse_mass, dt, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_mass_inverse * sizeof(T));
    input_size_list_.push_back(ele_sqrt_mass * sizeof(T));
    input_size_list_.push_back(atom_numbers * 3 * sizeof(T));
    input_size_list_.push_back(atom_numbers * 3 * sizeof(T));
    input_size_list_.push_back(atom_numbers * 3 * sizeof(T));
    input_size_list_.push_back(atom_numbers * 3 * sizeof(T));
    input_size_list_.push_back(atom_numbers * sizeof(T));

    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(sizeof(T));
  }

 private:
  size_t ele_mass_inverse = 1;
  size_t ele_sqrt_mass = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int float4_numbers;
  int atom_numbers;
  float half_dt;
  float dt;
  float exp_gamma;
  int is_max_velocity;
  float max_velocity;
};
}  // namespace kernel
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NVTIT_MD_ITERATION_LEAP_FROG_KERNEL_H_
#endif
@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * PMEEnergyUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/pme/pme_energy_update_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(PMEEnergyUpdate,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      PMEEnergyUpdateGpuKernel, float, int)
}  // namespace kernel
}  // namespace mindspore
@ -0,0 +1,313 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * PMEEnergyUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_ENERGY_UPDATE_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_ENERGY_UPDATE_KERNEL_H_
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_update_impl.cuh"

namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class PMEEnergyUpdateGpuKernel : public GpuKernel {
 public:
  PMEEnergyUpdateGpuKernel() : ele_uint_crd(1) {}
  ~PMEEnergyUpdateGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    excluded_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "excluded_numbers"));
    beta = static_cast<float>(GetAttr<float_t>(kernel_node, "beta"));
    fftx = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftx"));
    ffty = static_cast<int>(GetAttr<int64_t>(kernel_node, "ffty"));
    fftz = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftz"));

    float box_length_0 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_0"));
    float box_length_1 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_1"));
    float box_length_2 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_2"));
    max_neighbor_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "max_neighbor_numbers"));
    need_update = static_cast<int>(GetAttr<int64_t>(kernel_node, "need_update"));

    cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
    cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
    _thread_PME.x = 8;
    _thread_PME.y = 8;
    PME_Nin = ffty * fftz;
    PME_Nfft = fftx * ffty * (fftz / 2 + 1);
    PME_Nall = fftx * ffty * fftz;
    PME_kxyz_cpu.resize(64);

    std::vector<float> h_box_length{box_length_0, box_length_1, box_length_2};
    VECTOR *box_length = reinterpret_cast<VECTOR *>(h_box_length.data());

    volume = box_length[0].x * box_length[0].y * box_length[0].z;
    int kx, ky, kz, kxrp, kyrp, kzrp, index;
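    // Enumerate the 64 = 4 x 4 x 4 grid offsets each charge spreads onto
    // (fourth-order B-spline interpolation).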
    for (kx = 0; kx < 4; kx++) {
      for (ky = 0; ky < 4; ky++) {
        for (kz = 0; kz < 4; kz++) {
          index = kx * 16 + ky * 4 + kz;
          PME_kxyz_cpu[index].uint_x = kx;
          PME_kxyz_cpu[index].uint_y = ky;
          PME_kxyz_cpu[index].uint_z = kz;
        }
      }
    }

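    // B1/B2/B3 hold |b(k)|^2, the squared Euler exponential-spline moduli per FFT axis;
    // PME_BC0 below combines them with C(m) = exp(-pi^2 m^2 / beta^2) / (pi m^2 V).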
    B1.resize(fftx);
    B2.resize(ffty);
    B3.resize(fftz);
    PME_BC0.resize(PME_Nfft);
    for (kx = 0; kx < fftx; kx++) {
      B1[kx] = getb(kx, fftx, 4);
    }

    for (ky = 0; ky < ffty; ky++) {
      B2[ky] = getb(ky, ffty, 4);
    }

    for (kz = 0; kz < fftz; kz++) {
      B3[kz] = getb(kz, fftz, 4);
    }
    float mprefactor = PI * PI / -beta / beta;

    float msq;
    for (kx = 0; kx < fftx; kx++) {
      kxrp = kx;
      if (kx > fftx / 2) kxrp = fftx - kx;
      for (ky = 0; ky < ffty; ky++) {
        kyrp = ky;
        if (ky > ffty / 2) kyrp = ffty - ky;
        for (kz = 0; kz <= fftz / 2; kz++) {
          kzrp = kz;

          msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
                kzrp * kzrp / box_length[0].z / box_length[0].z;
          index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
          if ((kx + ky + kz) == 0) {
            PME_BC0[index] = 0;
          } else {
            PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
          }

          PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
        }
      }
    }

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd = GetDeviceAddress<T1>(inputs, 0);
    auto charge = GetDeviceAddress<T>(inputs, 1);
    auto nl_numbers = GetDeviceAddress<T1>(inputs, 2);
    auto nl_serial = GetDeviceAddress<T1>(inputs, 3);
    auto scaler = GetDeviceAddress<T>(inputs, 4);
    auto excluded_list_start = GetDeviceAddress<int>(inputs, 5);
    auto excluded_list = GetDeviceAddress<int>(inputs, 6);
    auto excluded_atom_numbers = GetDeviceAddress<int>(inputs, 7);
    auto neutralizing_factor = GetDeviceAddress<T>(inputs, 8);
    auto d_beta = GetDeviceAddress<T>(inputs, 9);

    auto pme_uxyz = GetDeviceAddress<int>(workspace, 0);       // workspace
    auto pme_frxyz = GetDeviceAddress<float>(workspace, 1);    // workspace
    auto pme_q = GetDeviceAddress<T>(workspace, 2);            // workspace
    auto pme_fq = GetDeviceAddress<float>(workspace, 3);       // workspace
    auto pme_atom_near = GetDeviceAddress<int>(workspace, 4);  // workspace
    auto pme_bc = GetDeviceAddress<float>(workspace, 5);       // workspace
    auto pme_kxyz = GetDeviceAddress<int>(workspace, 6);       // workspace
    auto nl = GetDeviceAddress<T1>(workspace, 7);
    auto charge_sum = GetDeviceAddress<float>(workspace, 8);

    auto reciprocal_ene = GetDeviceAddress<T>(outputs, 0);
    auto self_ene = GetDeviceAddress<T>(outputs, 1);
    auto direct_ene = GetDeviceAddress<T>(outputs, 2);
    auto correction_ene = GetDeviceAddress<T>(outputs, 3);

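    // When beta changes on device, the cached BC table is rescaled by the old/new beta
    // ratio rather than recomputed from scratch.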
    h_beta = beta;
    if (need_update) {
      cudaMemcpyAsync(&h_beta, d_beta, sizeof(float), cudaMemcpyDeviceToHost,
                      reinterpret_cast<cudaStream_t>(stream_ptr));
      cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr));
      factor = h_beta / beta;
      double factor_inverse = 1.0 / factor;

      for (int i = 0; i < PME_Nfft; i++) {
        PME_BC0[i] *= factor_inverse;  // update PME_BC0
      }
    }

    cufftSetStream(PME_plan_r2c, reinterpret_cast<cudaStream_t>(stream_ptr));
    cufftSetStream(PME_plan_c2r, reinterpret_cast<cudaStream_t>(stream_ptr));
    cudaMemcpyAsync(pme_kxyz, PME_kxyz_cpu.data(), sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice,
                    reinterpret_cast<cudaStream_t>(stream_ptr));
    cudaMemcpyAsync(pme_bc, PME_BC0.data(), sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice,
                    reinterpret_cast<cudaStream_t>(stream_ptr));

    PMEEnergyUpdate(fftx, ffty, fftz, atom_numbers, h_beta, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq, pme_atom_near,
                    pme_kxyz, uint_crd, charge, nl_numbers, nl_serial, nl, scaler, excluded_list_start, excluded_list,
                    excluded_atom_numbers, reciprocal_ene, self_ene, direct_ene, correction_ene, _thread_PME, PME_Nin,
                    PME_Nfft, PME_Nall, PME_plan_r2c, PME_plan_c2r, neutralizing_factor, charge_sum,
                    max_neighbor_numbers, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(max_neighbor_numbers * sizeof(T1));
    input_size_list_.push_back(sizeof(VECTOR));

    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(excluded_numbers * sizeof(T1));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(sizeof(T));
    input_size_list_.push_back(sizeof(T));

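    // Workspace order matches Launch(): uxyz, frxyz, the real-space charge grid Q (PME_Nall
    // reals), FFT(Q) (PME_Nfft complex values), atom_near (64 ints per atom), the BC table,
    // the 64 kxyz offsets, the rebuilt neighbor list, and charge_sum.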
    workspace_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    workspace_size_list_.push_back(atom_numbers * sizeof(VECTOR));
    workspace_size_list_.push_back(PME_Nall * sizeof(T));
    workspace_size_list_.push_back(PME_Nfft * sizeof(cufftComplex));
    workspace_size_list_.push_back(atom_numbers * 64 * sizeof(int));
    workspace_size_list_.push_back(PME_Nfft * sizeof(float));
    workspace_size_list_.push_back(64 * sizeof(UNSIGNED_INT_VECTOR));
    workspace_size_list_.push_back(atom_numbers * max_neighbor_numbers * sizeof(T1));
    workspace_size_list_.push_back(sizeof(T));

    output_size_list_.push_back(sizeof(T));
    output_size_list_.push_back(sizeof(T));
    output_size_list_.push_back(sizeof(T));
    output_size_list_.push_back(sizeof(T));
  }

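  // Host-side helpers for the B-spline factors: expc(z) returns exp(z) for complex z, M_ is
  // the cardinal B-spline recursion M_n(u), and getb returns |b(k)|^2 (the Euler
  // exponential-spline modulus used by smooth PME).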
  cufftComplex expc(cufftComplex z) {
    cufftComplex res;
    float t = expf(z.x);
    sincosf(z.y, &res.y, &res.x);
    res.x *= t;
    res.y *= t;
    return res;
  }

  float M_(float u, int n) {
    if (n == 2) {
      if (u > 2 || u < 0) return 0;
      return 1 - abs(u - 1);
    } else {
      return u / (n - 1) * M_(u, n - 1) + (n - u) / (n - 1) * M_(u - 1, n - 1);
    }
  }

  float getb(int k, int NFFT, int B_order) {
    cufftComplex tempc, tempc2, res;
    float tempf;
    tempc2.x = 0;
    tempc2.y = 0;

    tempc.x = 0;
    if (NFFT == 0) {
      MS_LOG(ERROR) << "Divide by zero.";
    } else {
      tempc.y = 2 * (B_order - 1) * PI * k / NFFT;
    }
    res = expc(tempc);

    for (int kk = 0; kk < (B_order - 1); kk++) {
      tempc.x = 0;
      if (NFFT == 0) {
        MS_LOG(ERROR) << "Divide by zero.";
        break;
      } else {
        tempc.y = 2 * PI * k / NFFT * kk;
      }
      tempc = expc(tempc);
      tempf = M_(kk + 1, B_order);
      tempc2.x += tempf * tempc.x;
      tempc2.y += tempf * tempc.y;
    }
    res = cuCdivf(res, tempc2);
    return res.x * res.x + res.y * res.y;
  }

 private:
  size_t ele_uint_crd = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;

  std::vector<float> B1;
  std::vector<float> B2;
  std::vector<float> B3;
  std::vector<float> PME_BC0;

  int atom_numbers;
  int excluded_numbers;
  int max_neighbor_numbers;
  int fftx;
  int ffty;
  int fftz;
  float beta;
  float factor;
  float h_beta;
  int PME_Nin;
  int PME_Nall;
  int PME_Nfft;
  float volume;
  float PI = 3.1415926;
  int need_update;
  cufftHandle PME_plan_r2c;
  cufftHandle PME_plan_c2r;

  dim3 _thread_PME;

  struct VECTOR {
    float x;
    float y;
    float z;
  };

  struct UNSIGNED_INT_VECTOR {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
  };
  std::vector<UNSIGNED_INT_VECTOR> PME_kxyz_cpu;
  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;
  };
};
}  // namespace kernel
}  // namespace mindspore
#endif
@ -13,17 +13,20 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * PMEExcludedForce. This is an experimental interface that is subject to change and/or deletion.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_EXCLUDED_FORCE_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_EXCLUDED_FORCE_KERNEL_H_
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <vector>
#include <string>
#include <map>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh"

namespace mindspore {
namespace kernel {
template <typename T, typename T1>
@ -0,0 +1,36 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * PMEExcludedForceUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/pme/pme_excluded_force_update_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(PMEExcludedForceUpdate,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      PMEExcludedForceUpdateGpuKernel, float, int)
}  // namespace kernel
}  // namespace mindspore
@ -0,0 +1,112 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * PMEExcludedForceUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_EXCLUDED_FORCE_UPDATE_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_EXCLUDED_FORCE_UPDATE_KERNEL_H_

#include <cuda_runtime_api.h>
#include <cufft.h>
#include <map>
#include <string>
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh"

namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class PMEExcludedForceUpdateGpuKernel : public GpuKernel {
 public:
  PMEExcludedForceUpdateGpuKernel() : ele_uint_crd(1) {}
  ~PMEExcludedForceUpdateGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    excluded_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "excluded_numbers"));
    beta = static_cast<float>(GetAttr<float_t>(kernel_node, "beta"));
    need_update = static_cast<int>(GetAttr<int64_t>(kernel_node, "need_update"));
    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd = GetDeviceAddress<int>(inputs, 0);
    auto scaler = GetDeviceAddress<T>(inputs, 1);
    auto charge = GetDeviceAddress<T>(inputs, 2);
    auto excluded_list_start = GetDeviceAddress<int>(inputs, 3);
    auto excluded_list = GetDeviceAddress<int>(inputs, 4);
    auto excluded_atom_numbers = GetDeviceAddress<int>(inputs, 5);
    auto d_beta = GetDeviceAddress<T>(inputs, 6);
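    // beta can be refreshed on device between steps; copy the current value back before launch.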
    if (need_update) {
      cudaMemcpyAsync(&beta, d_beta, sizeof(float), cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream_ptr));
      cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr));
    }

    auto force = GetDeviceAddress<T>(outputs, 0);
    PMEExcludedForce(atom_numbers, beta, uint_crd, scaler, charge, excluded_list_start, excluded_list,
                     excluded_atom_numbers, force, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    input_size_list_.push_back(sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(T));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(excluded_numbers * sizeof(T1));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(sizeof(T));

    output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  int excluded_numbers;
  float beta;
  float factor;
  int need_update;
  struct VECTOR {
    float x;
    float y;
    float z;
  };

  struct UNSIGNED_INT_VECTOR {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
  };
};
}  // namespace kernel
}  // namespace mindspore
#endif
@ -13,6 +13,10 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * PMEReciprocalForce. This is an experimental interface that is subject to change and/or deletion.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_RECIPROCAL_FORCE_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_RECIPROCAL_FORCE_KERNEL_H_
#include <cuda_runtime_api.h>
@ -144,9 +148,8 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(float));

    workspace_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    workspace_size_list_.push_back(atom_numbers * sizeof(VECTOR));
@ -0,0 +1,32 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * PMEReciprocalForceUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/pme/pme_reciprocal_force_update_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(PMEReciprocalForceUpdate,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      PMEReciprocalForceUpdateGpuKernel, float, int)
}  // namespace kernel
}  // namespace mindspore
@ -0,0 +1,280 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * PMEReciprocalForceUpdate. This is an experimental interface that is subject to change and/or deletion.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_RECIPROCAL_FORCE_UPDATE_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_RECIPROCAL_FORCE_UPDATE_KERNEL_H_
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh"

namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class PMEReciprocalForceUpdateGpuKernel : public GpuKernel {
 public:
  PMEReciprocalForceUpdateGpuKernel() : ele_uint_crd(1) {}
  ~PMEReciprocalForceUpdateGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    beta = static_cast<float>(GetAttr<float_t>(kernel_node, "beta"));
    fftx = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftx"));
    ffty = static_cast<int>(GetAttr<int64_t>(kernel_node, "ffty"));
    fftz = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftz"));

    float box_length_0 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_0"));
    float box_length_1 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_1"));
    float box_length_2 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_2"));

    need_update = static_cast<int>(GetAttr<int64_t>(kernel_node, "need_update"));
    PME_Nall = fftx * ffty * fftz;
    PME_Nfft = fftx * ffty * (fftz / 2 + 1);
    PME_Nin = ffty * fftz;
    cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
    cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);

    std::vector<float> h_box_length{box_length_0, box_length_1, box_length_2};
    VECTOR *box_length = reinterpret_cast<VECTOR *>(h_box_length.data());

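    // Inverse box vector in grid units: FFT grid size divided by box length on each axis.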
    PME_inverse_box_vector0.x = static_cast<float>(fftx) / box_length[0].x;
    PME_inverse_box_vector0.y = static_cast<float>(ffty) / box_length[0].y;
    PME_inverse_box_vector0.z = static_cast<float>(fftz) / box_length[0].z;
    PME_inverse_box_vector.x = PME_inverse_box_vector0.x;
    PME_inverse_box_vector.y = PME_inverse_box_vector0.y;
    PME_inverse_box_vector.z = PME_inverse_box_vector0.z;
    float volume = box_length[0].x * box_length[0].y * box_length[0].z;
    PME_kxyz_cpu.resize(64);
    int kx, ky, kz, kxrp, kyrp, kzrp, index;
    for (kx = 0; kx < 4; kx++) {
      for (ky = 0; ky < 4; ky++) {
        for (kz = 0; kz < 4; kz++) {
          index = kx * 16 + ky * 4 + kz;
          PME_kxyz_cpu[index].uint_x = kx;
          PME_kxyz_cpu[index].uint_y = ky;
          PME_kxyz_cpu[index].uint_z = kz;
        }
      }
    }
    B1.resize(fftx);
    B2.resize(ffty);
    B3.resize(fftz);
    PME_BC0.resize(PME_Nfft);

    for (kx = 0; kx < fftx; kx++) {
      B1[kx] = getb(kx, fftx, 4);
    }

    for (ky = 0; ky < ffty; ky++) {
      B2[ky] = getb(ky, ffty, 4);
    }

    for (kz = 0; kz < fftz; kz++) {
      B3[kz] = getb(kz, fftz, 4);
    }
    float mprefactor = PI * PI / -beta / beta;
    float msq;
    for (kx = 0; kx < fftx; kx++) {
      kxrp = kx;
      if (kx > fftx / 2) kxrp = fftx - kx;
      for (ky = 0; ky < ffty; ky++) {
        kyrp = ky;
        if (ky > ffty / 2) kyrp = ffty - ky;
        for (kz = 0; kz <= fftz / 2; kz++) {
          kzrp = kz;

          msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
                kzrp * kzrp / box_length[0].z / box_length[0].z;
          index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
          if ((kx + ky + kz) == 0) {
            PME_BC0[index] = 0;
          } else {
            PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
          }

          PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
        }
      }
    }
    PME_BC = PME_BC0;
    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd = GetDeviceAddress<const T1>(inputs, 0);
    auto charge = GetDeviceAddress<T>(inputs, 1);
    auto d_beta = GetDeviceAddress<T>(inputs, 2);

    auto pme_uxyz = GetDeviceAddress<int>(workspace, 0);       // workspace
    auto pme_frxyz = GetDeviceAddress<float>(workspace, 1);    // workspace
    auto pme_q = GetDeviceAddress<T>(workspace, 2);            // workspace
    auto pme_fq = GetDeviceAddress<float>(workspace, 3);       // workspace
    auto pme_atom_near = GetDeviceAddress<int>(workspace, 4);  // workspace
    auto pme_bc = GetDeviceAddress<float>(workspace, 5);       // workspace
    auto pme_kxyz = GetDeviceAddress<int>(workspace, 6);       // workspace

    auto force = GetDeviceAddress<T>(outputs, 0);
    h_beta = beta;
if (need_update) {
|
||||
cudaMemcpyAsync(&h_beta, d_beta, sizeof(float), cudaMemcpyDeviceToHost,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
factor = h_beta / beta;
|
||||
double factor_inverse = 1.0 / factor;
|
||||
|
||||
PME_inverse_box_vector.x = PME_inverse_box_vector0.x * factor_inverse;
|
||||
PME_inverse_box_vector.y = PME_inverse_box_vector0.y * factor_inverse;
|
||||
PME_inverse_box_vector.z = PME_inverse_box_vector0.z * factor_inverse;
|
||||
|
||||
for (int i = 0; i < PME_Nfft; i++) {
|
||||
PME_BC[i] = PME_BC0[i] * factor_inverse;
|
||||
}
|
||||
}
|
||||
|
||||
cufftSetStream(PME_plan_r2c, reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
cufftSetStream(PME_plan_c2r, reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
cudaMemcpyAsync(pme_kxyz, PME_kxyz_cpu.data(), sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
cudaMemcpyAsync(pme_bc, PME_BC.data(), sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
PMEReciprocalForce(fftx, ffty, fftz, atom_numbers, beta, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq, pme_atom_near,
|
||||
pme_kxyz, uint_crd, charge, force, PME_Nin, PME_Nall, PME_Nfft, PME_plan_r2c, PME_plan_c2r,
|
||||
PME_inverse_box_vector, reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
return true;
|
||||
}
 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(float));
    input_size_list_.push_back(sizeof(float));

    workspace_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));  // pme_uxyz
    workspace_size_list_.push_back(atom_numbers * sizeof(VECTOR));               // pme_frxyz
    workspace_size_list_.push_back(PME_Nall * sizeof(T));                        // pme_q, real mesh
    workspace_size_list_.push_back(PME_Nfft * sizeof(cufftComplex));             // pme_fq, complex mesh
    workspace_size_list_.push_back(atom_numbers * 64 * sizeof(int));             // pme_atom_near
    workspace_size_list_.push_back(PME_Nfft * sizeof(float));                    // pme_bc
    workspace_size_list_.push_back(64 * sizeof(UNSIGNED_INT_VECTOR));            // pme_kxyz

    output_size_list_.push_back(atom_numbers * sizeof(VECTOR));
  }

  // Complex exponential via Euler's formula: expc(z) = e^{z.x} * (cos(z.y) + i * sin(z.y));
  // sincosf writes the sine into res.y and the cosine into res.x in a single call.
  cufftComplex expc(cufftComplex z) {
    cufftComplex res;
    float t = expf(z.x);
    sincosf(z.y, &res.y, &res.x);
    res.x *= t;
    res.y *= t;
    return res;
  }
  // Cardinal B-spline of order n, by the standard recursion
  //   M_n(u) = u/(n-1) * M_{n-1}(u) + (n-u)/(n-1) * M_{n-1}(u-1),
  // with the order-2 base case the hat function on [0, 2].
  float M_(float u, int n) {
    if (n == 2) {
      if (u > 2 || u < 0) return 0;
      return 1 - fabsf(u - 1);
    } else {
      return u / (n - 1) * M_(u, n - 1) + (n - u) / (n - 1) * M_(u - 1, n - 1);
    }
  }
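  // getb(k, NFFT, B_order) evaluates the squared modulus of the B-spline Fourier coefficient
  //   b(k) = exp(2*pi*i*(B_order-1)*k/NFFT) / sum_{j=0}^{B_order-2} M_(j+1, B_order) * exp(2*pi*i*k*j/NFFT),
  // which fills the per-axis tables B1, B2, B3 used in Init().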
  float getb(int k, int NFFT, int B_order) {
    cufftComplex tempc, tempc2, res;
    float tempf;
    tempc2.x = 0;
    tempc2.y = 0;

    tempc.x = 0;
    if (NFFT == 0) {
      MS_LOG(ERROR) << "Divide by zero.";
    } else {
      tempc.y = 2 * (B_order - 1) * PI * k / NFFT;
    }
    res = expc(tempc);

    for (int kk = 0; kk < (B_order - 1); kk++) {
      tempc.x = 0;
      if (NFFT == 0) {
        MS_LOG(ERROR) << "Divide by zero.";
        break;
      } else {
        tempc.y = 2 * PI * k / NFFT * kk;
      }
      tempc = expc(tempc);
      tempf = M_(kk + 1, B_order);
      tempc2.x += tempf * tempc.x;
      tempc2.y += tempf * tempc.y;
    }
    res = cuCdivf(res, tempc2);
    return res.x * res.x + res.y * res.y;
  }
 private:
  size_t ele_uint_crd = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  int fftx;
  int ffty;
  int fftz;
  float beta;
  float h_beta;
  float factor;
  int PME_Nall;
  int PME_Nfft;
  int PME_Nin;
  float PI = 3.1415926;
  int need_update;
  std::vector<float> B1;
  std::vector<float> B2;
  std::vector<float> B3;
  std::vector<float> PME_BC0;
  std::vector<float> PME_BC;

  cufftHandle PME_plan_r2c;
  cufftHandle PME_plan_c2r;
  struct VECTOR {
    float x;
    float y;
    float z;
  };
  VECTOR PME_inverse_box_vector0;
  VECTOR PME_inverse_box_vector;
  struct UNSIGNED_INT_VECTOR {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
  };
  std::vector<UNSIGNED_INT_VECTOR> PME_kxyz_cpu;
};
}  // namespace kernel
}  // namespace mindspore
#endif
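As a cross-check on the B-spline machinery above, here is a minimal host-side sketch of the same |b(k)|^2 computation, with std::complex<float> standing in for cufftComplex; the names bspline and bspline_modulus are illustrative and not part of this commit:

#include <cmath>
#include <complex>
#include <cstdio>

// Cardinal B-spline M_n(u), same recursion as M_ in the kernel above.
static float bspline(float u, int n) {
  if (n == 2) return (u < 0 || u > 2) ? 0.0f : 1.0f - std::fabs(u - 1.0f);
  return u / (n - 1) * bspline(u, n - 1) + (n - u) / (n - 1) * bspline(u - 1.0f, n - 1);
}

// |b(k)|^2 for an NFFT-point mesh and interpolation order B_order, mirroring getb().
static float bspline_modulus(int k, int NFFT, int B_order) {
  const float PI = 3.1415926f;
  std::complex<float> num = std::exp(std::complex<float>(0.0f, 2 * (B_order - 1) * PI * k / NFFT));
  std::complex<float> den(0.0f, 0.0f);
  for (int j = 0; j < B_order - 1; j++) {
    den += bspline(j + 1.0f, B_order) * std::exp(std::complex<float>(0.0f, 2 * PI * k / NFFT * j));
  }
  return std::norm(num / den);  // squared modulus, as getb returns
}

int main() {
  for (int k = 0; k < 8; k++) printf("|b(%d)|^2 = %f\n", k, bspline_modulus(k, 8, 4));
  return 0;
}

Dividing by the spline's discrete Fourier sum and squaring is what cancels the interpolation error of the charge spreading in the reciprocal-space sum.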
@@ -13,12 +13,16 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * ConstrainForceCycleWithVirial. This is an experimental interface that is subject to change and/or deletion.
 */

#include "backend/kernel_compiler/gpu/sponge/simple_constrain/constrain_force_cycle_with_virial_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_THREE(ConstrainForceCycleWithVirial,
                        KernelAttr()
                          .AddInputAttr(kNumberTypeUInt32)
                          .AddInputAttr(kNumberTypeFloat32)
@@ -0,0 +1,41 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * ConstrainForce. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/simple_constrain/constrain_force_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_THREE(ConstrainForce,
                        KernelAttr()
                          .AddInputAttr(kNumberTypeFloat32)    // crd
                          .AddInputAttr(kNumberTypeFloat32)    // quarter_cof
                          .AddInputAttr(kNumberTypeFloat32)    // mass_inverse
                          .AddInputAttr(kNumberTypeFloat32)    // scaler
                          .AddInputAttr(kNumberTypeFloat32)    // pair_dr
                          .AddInputAttr(kNumberTypeInt32)      // atom_i_serials
                          .AddInputAttr(kNumberTypeInt32)      // atom_j_serials
                          .AddInputAttr(kNumberTypeFloat32)    // constant_rs
                          .AddInputAttr(kNumberTypeFloat32)    // constrain_ks
                          .AddOutputAttr(kNumberTypeUInt32)    // uint_crd
                          .AddOutputAttr(kNumberTypeFloat32)   // test_frc
                          .AddOutputAttr(kNumberTypeFloat32),  // atom_virial
                        ConstrainForceGpuKernel, float, int, unsigned int)

}  // namespace kernel
}  // namespace mindspore
@@ -0,0 +1,162 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * ConstrainForce. This is an experimental interface that is subject to change and/or deletion.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_KERNEL_H_

#include "backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh"

#include <cuda_runtime_api.h>
#include <map>
#include <string>
#include <vector>

#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"

namespace mindspore {
namespace kernel {
template <typename T, typename T1, typename T2>
class ConstrainForceGpuKernel : public GpuKernel {
 public:
  ConstrainForceGpuKernel() : ele_crd(1) {}
  ~ConstrainForceGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    constrain_pair_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "constrain_pair_numbers"));
    iteration_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "iteration_numbers"));
    half_exp_gamma_plus_half = static_cast<float>(GetAttr<float>(kernel_node, "half_exp_gamma_plus_half"));

    auto shape_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_quarter_cof = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_mass_inverse = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);

    auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_pair_dr = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_atom_i_serials = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_atom_j_serials = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_constant_rs = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);
    auto shape_constrain_ks = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 8);

    // Element counts are the products of each input's inferred shape.
    for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i];
    for (size_t i = 0; i < shape_pair_dr.size(); i++) ele_pair_dr *= shape_pair_dr[i];
    for (size_t i = 0; i < shape_atom_i_serials.size(); i++) ele_atom_i_serials *= shape_atom_i_serials[i];
    for (size_t i = 0; i < shape_atom_j_serials.size(); i++) ele_atom_j_serials *= shape_atom_j_serials[i];
    for (size_t i = 0; i < shape_constant_rs.size(); i++) ele_constant_rs *= shape_constant_rs[i];
    for (size_t i = 0; i < shape_constrain_ks.size(); i++) ele_constrain_ks *= shape_constrain_ks[i];

    for (size_t i = 0; i < shape_crd.size(); i++) ele_crd *= shape_crd[i];
    for (size_t i = 0; i < shape_quarter_cof.size(); i++) ele_quarter_cof *= shape_quarter_cof[i];
    for (size_t i = 0; i < shape_mass_inverse.size(); i++) ele_mass_inverse *= shape_mass_inverse[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto crd = GetDeviceAddress<const T>(inputs, 0);
    auto quarter_cof = GetDeviceAddress<const T>(inputs, 1);
    auto mass_inverse = GetDeviceAddress<const T>(inputs, 2);

    auto scaler = GetDeviceAddress<const T>(inputs, 3);
    auto pair_dr = GetDeviceAddress<const T>(inputs, 4);
    auto atom_i_serials = GetDeviceAddress<const T1>(inputs, 5);
    auto atom_j_serials = GetDeviceAddress<const T1>(inputs, 6);
    auto constant_rs = GetDeviceAddress<const T>(inputs, 7);
    auto constrain_ks = GetDeviceAddress<const T>(inputs, 8);

    auto constrain_pair = GetDeviceAddress<T>(workspace, 0);

    auto uint_crd = GetDeviceAddress<T2>(outputs, 0);

    auto test_frc_f = GetDeviceAddress<T>(outputs, 1);
    auto d_atom_virial = GetDeviceAddress<T>(outputs, 2);

    // Start from zeroed constraint forces and virial, then iterate the correction cycle.
    set_zero_force_with_virial(atom_numbers, constrain_pair_numbers, test_frc_f, d_atom_virial,
                               reinterpret_cast<cudaStream_t>(stream_ptr));

    for (int i = 0; i < iteration_numbers; i++) {
      refresh_uint_crd_update(atom_numbers, half_exp_gamma_plus_half, crd, quarter_cof, test_frc_f, mass_inverse,
                              uint_crd, reinterpret_cast<cudaStream_t>(stream_ptr));

      constrain_force_cycle_update(atom_numbers, constrain_pair_numbers, uint_crd, scaler, constrain_pair, pair_dr,
                                   atom_i_serials, atom_j_serials, constant_rs, constrain_ks, test_frc_f,
                                   reinterpret_cast<cudaStream_t>(stream_ptr));
    }
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_crd * sizeof(T));
    input_size_list_.push_back(ele_quarter_cof * sizeof(T));
    input_size_list_.push_back(ele_mass_inverse * sizeof(T));

    input_size_list_.push_back(ele_scaler * sizeof(T));
    input_size_list_.push_back(ele_pair_dr * sizeof(T));
    input_size_list_.push_back(ele_atom_i_serials * sizeof(T1));
    input_size_list_.push_back(ele_atom_j_serials * sizeof(T1));
    input_size_list_.push_back(ele_constant_rs * sizeof(T));
    input_size_list_.push_back(ele_constrain_ks * sizeof(T));

    workspace_size_list_.push_back(constrain_pair_numbers * sizeof(CONSTRAIN_PAIR));

    output_size_list_.push_back(3 * atom_numbers * sizeof(T2));
    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(constrain_pair_numbers * sizeof(T));
  }

 private:
  size_t ele_scaler = 1;
  size_t ele_pair_dr = 1;
  size_t ele_atom_i_serials = 1;
  size_t ele_atom_j_serials = 1;
  size_t ele_constant_rs = 1;
  size_t ele_constrain_ks = 1;
  size_t ele_crd = 1;
  size_t ele_quarter_cof = 1;
  size_t ele_mass_inverse = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  int constrain_pair_numbers;
  int iteration_numbers;
  int need_pressure;
  float half_exp_gamma_plus_half;
  struct CONSTRAIN_PAIR {
    int atom_i_serial;
    int atom_j_serial;
    float constant_r;
    float constrain_k;
  };
};
}  // namespace kernel
}  // namespace mindspore
#endif
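The shape of the algorithm is worth spelling out: ConstrainForce iterates a SHAKE-style correction, repeatedly mapping coordinates onto the unsigned-int grid and pushing each constrained pair back toward its reference length. A minimal host-side sketch of one correction sweep, with plain float coordinates in place of the kernels' unsigned-int grid and periodic scaler (the names and the exact force expression are illustrative assumptions, not the CUDA implementation):

#include <cmath>
#include <cstdio>
#include <vector>

struct Vec { float x, y, z; };
struct Pair { int i, j; float r0, k; };  // constrained pair: target length r0, per-pair weight k

// One correction sweep: each pair is nudged back toward its reference length r0.
// g vanishes when |r_ij| == r0, so repeated sweeps converge on satisfied constraints.
void constrain_cycle(const std::vector<Pair> &pairs, const std::vector<Vec> &crd, std::vector<Vec> &frc) {
  for (const Pair &cp : pairs) {
    Vec dr = {crd[cp.j].x - crd[cp.i].x, crd[cp.j].y - crd[cp.i].y, crd[cp.j].z - crd[cp.i].z};
    float r = std::sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
    float g = cp.k * (1.0f - cp.r0 / r);  // pulls i toward j when the pair is stretched
    frc[cp.i].x += g * dr.x; frc[cp.i].y += g * dr.y; frc[cp.i].z += g * dr.z;
    frc[cp.j].x -= g * dr.x; frc[cp.j].y -= g * dr.y; frc[cp.j].z -= g * dr.z;
  }
}

int main() {
  std::vector<Vec> crd = {{0, 0, 0}, {1.2f, 0, 0}};  // one pair, stretched past r0 = 1.0
  std::vector<Vec> frc(2, {0, 0, 0});
  std::vector<Pair> pairs = {{0, 1, 1.0f, 0.5f}};
  constrain_cycle(pairs, crd, frc);
  printf("correction on atom 0: (%f, %f, %f)\n", frc[0].x, frc[0].y, frc[0].z);
  return 0;
}

Each sweep leaves satisfied pairs untouched, so iteration_numbers sweeps relax the whole coupled set of constraints together.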
@@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * ConstrainForceVirial. This is an experimental interface that is subject to change and/or deletion.
 */

#include "backend/kernel_compiler/gpu/sponge/simple_constrain/constrain_force_virial_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_THREE(ConstrainForceVirial,
                        KernelAttr()
                          .AddInputAttr(kNumberTypeFloat32)    // crd
                          .AddInputAttr(kNumberTypeFloat32)    // quarter_cof
                          .AddInputAttr(kNumberTypeFloat32)    // mass_inverse
                          .AddInputAttr(kNumberTypeFloat32)    // scaler
                          .AddInputAttr(kNumberTypeFloat32)    // pair_dr
                          .AddInputAttr(kNumberTypeInt32)      // atom_i_serials
                          .AddInputAttr(kNumberTypeInt32)      // atom_j_serials
                          .AddInputAttr(kNumberTypeFloat32)    // constant_rs
                          .AddInputAttr(kNumberTypeFloat32)    // constrain_ks
                          .AddOutputAttr(kNumberTypeUInt32)    // uint_crd
                          .AddOutputAttr(kNumberTypeFloat32)   // test_frc
                          .AddOutputAttr(kNumberTypeFloat32),  // atom_virial
                        ConstrainForceVirialGpuKernel, float, int, unsigned int)

}  // namespace kernel
}  // namespace mindspore
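For downstream pressure estimation, the per-pair quantity accumulated into atom_virial is presumably the usual pairwise virial contribution,

W = \sum_{(i,j)} \mathbf{f}_{ij} \cdot \mathbf{r}_{ij},

summed over the constrained pairs; comparing the two headers, the with-virial kernels differ from their plain counterparts only in carrying this accumulator.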
@@ -0,0 +1,159 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * ConstrainForceVirial. This is an experimental interface that is subject to change and/or deletion.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_VIRIAL_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_SIMPLE_CONSTRAIN_CONSTRAIN_FORCE_VIRIAL_KERNEL_H_

#include "backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh"

#include <cuda_runtime_api.h>
#include <map>
#include <string>
#include <vector>

#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"

namespace mindspore {
namespace kernel {
template <typename T, typename T1, typename T2>
class ConstrainForceVirialGpuKernel : public GpuKernel {
 public:
  ConstrainForceVirialGpuKernel() : ele_crd(1) {}
  ~ConstrainForceVirialGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    constrain_pair_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "constrain_pair_numbers"));
    iteration_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "iteration_numbers"));
    half_exp_gamma_plus_half = static_cast<float>(GetAttr<float>(kernel_node, "half_exp_gamma_plus_half"));

    auto shape_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_quarter_cof = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_mass_inverse = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);

    auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_pair_dr = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_atom_i_serials = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_atom_j_serials = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_constant_rs = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);
    auto shape_constrain_ks = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 8);

    // Element counts are the products of each input's inferred shape.
    for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i];
    for (size_t i = 0; i < shape_pair_dr.size(); i++) ele_pair_dr *= shape_pair_dr[i];
    for (size_t i = 0; i < shape_atom_i_serials.size(); i++) ele_atom_i_serials *= shape_atom_i_serials[i];
    for (size_t i = 0; i < shape_atom_j_serials.size(); i++) ele_atom_j_serials *= shape_atom_j_serials[i];
    for (size_t i = 0; i < shape_constant_rs.size(); i++) ele_constant_rs *= shape_constant_rs[i];
    for (size_t i = 0; i < shape_constrain_ks.size(); i++) ele_constrain_ks *= shape_constrain_ks[i];

    for (size_t i = 0; i < shape_crd.size(); i++) ele_crd *= shape_crd[i];
    for (size_t i = 0; i < shape_quarter_cof.size(); i++) ele_quarter_cof *= shape_quarter_cof[i];
    for (size_t i = 0; i < shape_mass_inverse.size(); i++) ele_mass_inverse *= shape_mass_inverse[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto crd = GetDeviceAddress<const T>(inputs, 0);
    auto quarter_cof = GetDeviceAddress<const T>(inputs, 1);
    auto mass_inverse = GetDeviceAddress<const T>(inputs, 2);
    auto scaler = GetDeviceAddress<const T>(inputs, 3);
    auto pair_dr = GetDeviceAddress<const T>(inputs, 4);
    auto atom_i_serials = GetDeviceAddress<const T1>(inputs, 5);
    auto atom_j_serials = GetDeviceAddress<const T1>(inputs, 6);
    auto constant_rs = GetDeviceAddress<const T>(inputs, 7);
    auto constrain_ks = GetDeviceAddress<const T>(inputs, 8);
    auto constrain_pair = GetDeviceAddress<T>(workspace, 0);
    auto uint_crd = GetDeviceAddress<T2>(outputs, 0);
    auto test_frc_f = GetDeviceAddress<T>(outputs, 1);
    auto d_atom_virial = GetDeviceAddress<T>(outputs, 2);

    set_zero_force_with_virial(atom_numbers, constrain_pair_numbers, test_frc_f, d_atom_virial,
                               reinterpret_cast<cudaStream_t>(stream_ptr));

    // Same correction cycle as ConstrainForce, except the per-pair virial is always accumulated.
    for (int i = 0; i < iteration_numbers; i++) {
      refresh_uint_crd_update(atom_numbers, half_exp_gamma_plus_half, crd, quarter_cof, test_frc_f, mass_inverse,
                              uint_crd, reinterpret_cast<cudaStream_t>(stream_ptr));

      constrain_force_cycle_with_virial_update(atom_numbers, constrain_pair_numbers, uint_crd, scaler, constrain_pair,
                                               pair_dr, atom_i_serials, atom_j_serials, constant_rs, constrain_ks,
                                               test_frc_f, d_atom_virial, reinterpret_cast<cudaStream_t>(stream_ptr));
    }

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_crd * sizeof(T));
    input_size_list_.push_back(ele_quarter_cof * sizeof(T));
    input_size_list_.push_back(ele_mass_inverse * sizeof(T));

    input_size_list_.push_back(ele_scaler * sizeof(T));
    input_size_list_.push_back(ele_pair_dr * sizeof(T));
    input_size_list_.push_back(ele_atom_i_serials * sizeof(T1));
    input_size_list_.push_back(ele_atom_j_serials * sizeof(T1));
    input_size_list_.push_back(ele_constant_rs * sizeof(T));
    input_size_list_.push_back(ele_constrain_ks * sizeof(T));

    workspace_size_list_.push_back(constrain_pair_numbers * sizeof(CONSTRAIN_PAIR));

    output_size_list_.push_back(3 * atom_numbers * sizeof(T2));

    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(constrain_pair_numbers * sizeof(T));
  }

 private:
  size_t ele_scaler = 1;
  size_t ele_pair_dr = 1;
  size_t ele_atom_i_serials = 1;
  size_t ele_atom_j_serials = 1;
  size_t ele_constant_rs = 1;
  size_t ele_constrain_ks = 1;
  size_t ele_crd = 1;
  size_t ele_quarter_cof = 1;
  size_t ele_mass_inverse = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  int constrain_pair_numbers;
  int iteration_numbers;
  int need_pressure;
  float half_exp_gamma_plus_half;
  struct CONSTRAIN_PAIR {
    int atom_i_serial;
    int atom_j_serial;
    float constant_r;
    float constrain_k;
  };
};
}  // namespace kernel
}  // namespace mindspore
#endif
@@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * Constrain. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/simple_constrain/constrain_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_THREE(Constrain,
                        KernelAttr()
                          .AddInputAttr(kNumberTypeFloat32)    // crd
                          .AddInputAttr(kNumberTypeFloat32)    // quarter_cof
                          .AddInputAttr(kNumberTypeFloat32)    // mass_inverse
                          .AddInputAttr(kNumberTypeFloat32)    // scaler
                          .AddInputAttr(kNumberTypeFloat32)    // pair_dr
                          .AddInputAttr(kNumberTypeInt32)      // atom_i_serials
                          .AddInputAttr(kNumberTypeInt32)      // atom_j_serials
                          .AddInputAttr(kNumberTypeFloat32)    // constant_rs
                          .AddInputAttr(kNumberTypeFloat32)    // constrain_ks
                          .AddInputAttr(kNumberTypeInt32)      // need_pressure, selects the with-virial cycle
                          .AddOutputAttr(kNumberTypeUInt32)    // uint_crd
                          .AddOutputAttr(kNumberTypeFloat32)   // test_frc
                          .AddOutputAttr(kNumberTypeFloat32),  // atom_virial
                        ConstrainGpuKernel, float, int, unsigned int)

}  // namespace kernel
}  // namespace mindspore
@@ -0,0 +1,176 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * Constrain. This is an experimental interface that is subject to change and/or deletion.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_SIMPLE_CONSTRAIN_CONSTRAIN_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_SIMPLE_CONSTRAIN_CONSTRAIN_KERNEL_H_

#include "backend/kernel_compiler/gpu/cuda_impl/sponge/simple_constrain/constrain_force_virial_impl.cuh"

#include <cuda_runtime_api.h>
#include <map>
#include <string>
#include <vector>

#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"

namespace mindspore {
namespace kernel {
template <typename T, typename T1, typename T2>
class ConstrainGpuKernel : public GpuKernel {
 public:
  ConstrainGpuKernel() : ele_crd(1), step(1) {}
  ~ConstrainGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    // read the op attributes
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    constrain_pair_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "constrain_pair_numbers"));
    iteration_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "iteration_numbers"));
    half_exp_gamma_plus_half = static_cast<float>(GetAttr<float>(kernel_node, "half_exp_gamma_plus_half"));

    auto shape_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_quarter_cof = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_mass_inverse = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);

    auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_pair_dr = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_atom_i_serials = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_atom_j_serials = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_constant_rs = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);
    auto shape_constrain_ks = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 8);

    // Element counts are the products of each input's inferred shape.
    for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i];
    for (size_t i = 0; i < shape_pair_dr.size(); i++) ele_pair_dr *= shape_pair_dr[i];
    for (size_t i = 0; i < shape_atom_i_serials.size(); i++) ele_atom_i_serials *= shape_atom_i_serials[i];
    for (size_t i = 0; i < shape_atom_j_serials.size(); i++) ele_atom_j_serials *= shape_atom_j_serials[i];
    for (size_t i = 0; i < shape_constant_rs.size(); i++) ele_constant_rs *= shape_constant_rs[i];
    for (size_t i = 0; i < shape_constrain_ks.size(); i++) ele_constrain_ks *= shape_constrain_ks[i];

    for (size_t i = 0; i < shape_crd.size(); i++) ele_crd *= shape_crd[i];
    for (size_t i = 0; i < shape_quarter_cof.size(); i++) ele_quarter_cof *= shape_quarter_cof[i];
    for (size_t i = 0; i < shape_mass_inverse.size(); i++) ele_mass_inverse *= shape_mass_inverse[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto crd = GetDeviceAddress<const T>(inputs, 0);
    auto quarter_cof = GetDeviceAddress<const T>(inputs, 1);
    auto mass_inverse = GetDeviceAddress<const T>(inputs, 2);
    auto scaler = GetDeviceAddress<const T>(inputs, 3);
    auto pair_dr = GetDeviceAddress<const T>(inputs, 4);
    auto atom_i_serials = GetDeviceAddress<const T1>(inputs, 5);
    auto atom_j_serials = GetDeviceAddress<const T1>(inputs, 6);
    auto constant_rs = GetDeviceAddress<const T>(inputs, 7);
    auto constrain_ks = GetDeviceAddress<const T>(inputs, 8);
    auto d_need_pressure = GetDeviceAddress<T1>(inputs, 9);

    auto constrain_pair = GetDeviceAddress<T>(workspace, 0);

    auto uint_crd = GetDeviceAddress<T2>(outputs, 0);

    auto test_frc_f = GetDeviceAddress<T>(outputs, 1);
    auto d_atom_virial = GetDeviceAddress<T>(outputs, 2);

    // The pressure flag lives on the device, so it is read back (with a blocking sync)
    // before choosing which constraint cycle to run this step.
    cudaMemcpyAsync(&need_pressure, d_need_pressure, sizeof(int), cudaMemcpyDeviceToHost,
                    reinterpret_cast<cudaStream_t>(stream_ptr));
    cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr));

    set_zero_force_with_virial(atom_numbers, constrain_pair_numbers, test_frc_f, d_atom_virial,
                               reinterpret_cast<cudaStream_t>(stream_ptr));

    for (int i = 0; i < iteration_numbers; i++) {
      refresh_uint_crd_update(atom_numbers, half_exp_gamma_plus_half, crd, quarter_cof, test_frc_f, mass_inverse,
                              uint_crd, reinterpret_cast<cudaStream_t>(stream_ptr));

      if (need_pressure) {
        constrain_force_cycle_with_virial_update(atom_numbers, constrain_pair_numbers, uint_crd, scaler, constrain_pair,
                                                 pair_dr, atom_i_serials, atom_j_serials, constant_rs, constrain_ks,
                                                 test_frc_f, d_atom_virial, reinterpret_cast<cudaStream_t>(stream_ptr));
      } else {
        constrain_force_cycle_update(atom_numbers, constrain_pair_numbers, uint_crd, scaler, constrain_pair, pair_dr,
                                     atom_i_serials, atom_j_serials, constant_rs, constrain_ks, test_frc_f,
                                     reinterpret_cast<cudaStream_t>(stream_ptr));
      }
    }
    step++;

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_crd * sizeof(T));
    input_size_list_.push_back(ele_quarter_cof * sizeof(T));
    input_size_list_.push_back(ele_mass_inverse * sizeof(T));

    input_size_list_.push_back(ele_scaler * sizeof(T));
    input_size_list_.push_back(ele_pair_dr * sizeof(T));
    input_size_list_.push_back(ele_atom_i_serials * sizeof(T1));
    input_size_list_.push_back(ele_atom_j_serials * sizeof(T1));
    input_size_list_.push_back(ele_constant_rs * sizeof(T));
    input_size_list_.push_back(ele_constrain_ks * sizeof(T));
    input_size_list_.push_back(sizeof(T1));  // need_pressure flag (input 9)

    workspace_size_list_.push_back(constrain_pair_numbers * sizeof(CONSTRAIN_PAIR));

    output_size_list_.push_back(3 * atom_numbers * sizeof(T2));

    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(constrain_pair_numbers * sizeof(T));
  }

 private:
  size_t ele_scaler = 1;
  size_t ele_pair_dr = 1;
  size_t ele_atom_i_serials = 1;
  size_t ele_atom_j_serials = 1;
  size_t ele_constant_rs = 1;
  size_t ele_constrain_ks = 1;
  size_t ele_crd = 1;
  size_t ele_quarter_cof = 1;
  size_t ele_mass_inverse = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  int constrain_pair_numbers;
  int iteration_numbers;
  int need_pressure;
  float half_exp_gamma_plus_half;
  int step;
  struct CONSTRAIN_PAIR {
    int atom_i_serial;
    int atom_j_serial;
    float constant_r;
    float constrain_k;
  };
};
}  // namespace kernel
}  // namespace mindspore
#endif
@@ -13,12 +13,15 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 *Note:
 * LastCrdToDr. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/simple_constrain/last_crd_to_dr_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(LastCrdToDr,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
@@ -13,12 +13,16 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *Note:
 * RefreshCrdVel. This is an experimental interface that is subject to change and/or deletion.
 */

#include "backend/kernel_compiler/gpu/sponge/simple_constrain/refresh_crd_vel_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(RefreshCrdVel,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
@@ -13,12 +13,15 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 *Note:
 * RefreshUintCrd. This is an experimental interface that is subject to change and/or deletion.
 */
#include "backend/kernel_compiler/gpu/sponge/simple_constrain/refresh_uint_crd_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(RefreshUintCrd,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)