diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
index c9b7b1dcf27..16759760047 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -222,6 +222,8 @@ void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *
   }
 }
 
+template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1, bool *y,
+                         cudaStream_t stream);
 template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, bool *y,
                          cudaStream_t stream);
 template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, bool *y,
@@ -292,6 +294,8 @@ void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, cons
   }
 }
 
+template void ElewiseArith(const int &nums, enum BroadcastOpType op, const double *x0, const double *x1, double *y,
+                           cudaStream_t stream);
 template void ElewiseArith(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, float *y,
                            cudaStream_t stream);
 template void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, half *y,
@@ -372,6 +376,9 @@ void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_di
   }
 }
 
+template void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
+                           const std::vector<int> &y_dims, enum BroadcastOpType op, const double *x0,
+                           const double *x1, bool *y, cudaStream_t stream);
 template void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                            const std::vector<int> &y_dims, enum BroadcastOpType op, const float *x0,
                            const float *x1, bool *y, cudaStream_t stream);
@@ -501,6 +508,9 @@ void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_
   }
 }
 
+template void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
+                             const std::vector<int> &y_dims, enum BroadcastOpType op, const double *x0,
+                             const double *x1, double *y, cudaStream_t stream);
 template void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                              const std::vector<int> &y_dims, enum BroadcastOpType op, const float *x0,
                              const float *x1, float *y, cudaStream_t stream);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
index 1614b008f04..8d232afcd7f 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
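The .cu instantiations above only compile the double variants of the element-wise and broadcast functors; the registrations in the next hunk expose them to the framework under kNumberTypeFloat64. A minimal smoke test for the new path (a hypothetical standalone script, mirroring the st tests later in this diff):

    import numpy as np
    import mindspore.context as context
    from mindspore import Tensor
    from mindspore.ops import operations as P

    # Exercises the new fp64 broadcast kernel registered below.
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    x = Tensor(np.array([1.0, 2.0, 3.0], dtype=np.float64))
    y = Tensor(np.array([2.0], dtype=np.float64))  # broadcasts against x
    print(P.Add()(x, y))  # expected: [3. 4. 5.]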
@@ -18,6 +18,20 @@
 namespace mindspore {
 namespace kernel {
+// fp64
+MS_REG_GPU_KERNEL_ONE(
+  Add, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
+  BroadcastOpGpuKernel, double)
+MS_REG_GPU_KERNEL_ONE(
+  Sub, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
+  BroadcastOpGpuKernel, double)
+MS_REG_GPU_KERNEL_ONE(
+  Mul, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
+  BroadcastOpGpuKernel, double)
+MS_REG_GPU_KERNEL_ONE(
+  Div, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
+  BroadcastOpGpuKernel, double)
+
 // fp32
 MS_REG_GPU_KERNEL_ONE(
   Greater,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.cc
index 71a6dea3498..ced980b667b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,6 +31,10 @@
 MS_REG_GPU_KERNEL_ONE(GpuConvertToDynamicShape,
                       KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                       GpuConvertToDynamicShapeGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(GpuConvertToDynamicShape,
+                      KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
+                      GpuConvertToDynamicShapeGpuKernel, double)
+
 MS_REG_GPU_KERNEL_ONE(GpuConvertToDynamicShape,
                       KernelAttr().AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
                       GpuConvertToDynamicShapeGpuKernel, int8_t)
diff --git a/tests/st/ops/gpu/test_tensoradd.py b/tests/st/ops/gpu/test_add_op.py
similarity index 70%
rename from tests/st/ops/gpu/test_tensoradd.py
rename to tests/st/ops/gpu/test_add_op.py
index 5836618b592..23259728e4e 100644
--- a/tests/st/ops/gpu/test_tensoradd.py
+++ b/tests/st/ops/gpu/test_add_op.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Huawei Technologies Co., Ltd
+# Copyright 2019-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
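The test below is rewritten from a single hardcoded float32 case into a dtype-generic helper plus one thin pytest wrapper per dtype. An equivalent, more compact structure using pytest's parametrize (an alternative sketch, not what this change does):

    import numpy as np
    import pytest

    # Hypothetical alternative: one parametrized test instead of five wrappers.
    @pytest.mark.level0
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.env_onecard
    @pytest.mark.parametrize("nptype", [np.float64, np.float32, np.float16, np.int64, np.int32])
    def test_add(nptype):
        add(nptype)  # the dtype-generic helper defined in this file

One wrapper per dtype keeps each dtype as a separately named, filterable test case without relying on parametrized test IDs.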
@@ -25,34 +25,32 @@ from mindspore.common.parameter import Parameter
 from mindspore.ops import operations as P
 from mindspore.ops.operations import _inner_ops as inner
 
-context.set_context(device_target='GPU')
-
-
-class TensroAdd(nn.Cell):
-    def __init__(self):
-        super(TensroAdd, self).__init__()
+class AddNet(nn.Cell):
+    def __init__(self, nptype):
+        super(AddNet, self).__init__()
         self.add = P.Add()
+        np.random.seed(0)
         self.x = Parameter(initializer(
-            Tensor(np.random.randn(2, 0).astype(np.float32)), [2, 0]), name='x')
+            Tensor(np.random.randn(2, 0).astype(nptype)), [2, 0]), name='x')
         self.y = Parameter(initializer(
-            Tensor(np.random.randn(2, 1).astype(np.float32)), [2, 1]), name='y')
+            Tensor(np.random.randn(2, 1).astype(nptype)), [2, 1]), name='y')
         self.x1 = Parameter(initializer(
-            Tensor(np.arange(3).reshape(3).astype(np.float32)), [3]), name='x1')
+            Tensor(np.arange(3).reshape(3).astype(nptype)), [3]), name='x1')
         self.y1 = Parameter(initializer(
-            Tensor(np.array([2]).astype(np.float32)), [1]), name='y1')
+            Tensor(np.array([2]).astype(nptype)), [1]), name='y1')
         self.x2 = Parameter(initializer(
-            Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(np.float32)), [3, 3, 3, 3]), name='x2')
+            Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(nptype)), [3, 3, 3, 3]), name='x2')
         self.y2 = Parameter(initializer(
-            Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(np.float32)), [3, 3, 3, 3]), name='y2')
+            Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(nptype)), [3, 3, 3, 3]), name='y2')
         self.x3 = Parameter(initializer(
-            Tensor(np.arange(1 * 1 * 3 * 3).reshape(1, 1, 3, 3).astype(np.float32)), [1, 1, 3, 3]), name='x3')
+            Tensor(np.arange(1 * 1 * 3 * 3).reshape(1, 1, 3, 3).astype(nptype)), [1, 1, 3, 3]), name='x3')
         self.y3 = Parameter(initializer(
-            Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(np.float32)), [3, 3, 3, 3]), name='y3')
+            Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(nptype)), [3, 3, 3, 3]), name='y3')
 
     @ms_function
     def construct(self):
@@ -61,14 +59,13 @@ class TensroAdd(nn.Cell):
                 self.add(self.x3, self.y3))
 
 
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_training
-@pytest.mark.env_onecard
-def test_TensorAdd():
-    add = TensroAdd()
-    output = add()
+def add(nptype):
+    context.set_context(device_target='GPU')
+
+    add_net = AddNet(nptype)
+    output = add_net()
     expect0 = np.array([])
-    expect1 = np.array([2, 3, 4])
+    expect1 = np.array([2, 3, 4]).astype(nptype)
     expect2 = np.array(
         [[[[0., 2., 4.],
            [6., 8., 10.],
@@ -96,7 +93,7 @@
            [138., 140., 142.]],
           [[144., 146., 148.],
            [150., 152., 154.],
-           [156., 158., 160.]]]])
+           [156., 158., 160.]]]]).astype(nptype)
     expect3 = np.array(
         [[[[0., 2., 4.],
            [6., 8., 10.],
@@ -124,13 +121,42 @@
            [75., 77., 79.]],
           [[72., 74., 76.],
            [78., 80., 82.],
-           [84., 86., 88.]]]]
-    )
+           [84., 86., 88.]]]]).astype(nptype)
     assert (output[0].asnumpy() == expect0).all()
     assert (output[1].asnumpy() == expect1).all()
     assert (output[2].asnumpy() == expect2).all()
     assert (output[3].asnumpy() == expect3).all()
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_float64():
+    add(np.float64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_float32():
+    add(np.float32)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_float16():
+    add(np.float16)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_int64():
+    add(np.int64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_int32():
+    add(np.int32)
+
 class Tensoradd_d(nn.Cell):
     def __init__(self):
         super(Tensoradd_d, self).__init__()
@@ -142,18 +168,16 @@ class Tensoradd_d(nn.Cell):
         y = self.test_dynamic(y)
         return self.add(x, y)
 
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_training
-@pytest.mark.env_onecard
-def test_TensorAdd_dynamic():
+
+def add_dynamic(nptype):
     context.set_context(device_target='GPU', mode=context.GRAPH_MODE)
     net = Tensoradd_d()
-    x1 = Tensor(np.arange(3).reshape(3).astype(np.float32))
-    y1 = Tensor(np.array([2]).astype(np.float32))
+    x1 = Tensor(np.arange(3).reshape(3).astype(nptype))
+    y1 = Tensor(np.array([2]).astype(nptype))
 
-    x2 = Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(np.float32))
-    y2 = Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(np.float32))
+    x2 = Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(nptype))
+    y2 = Tensor(np.arange(3 * 3 * 3 * 3).reshape(3, 3, 3, 3).astype(nptype))
 
     expect1 = np.array([2, 3, 4])
     expect2 = np.array(
@@ -189,3 +213,33 @@
     output2 = net(x2, y2)
     assert (output1.asnumpy() == expect1).all()
     assert (output2.asnumpy() == expect2).all()
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_dynamic_float64():
+    add_dynamic(np.float64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_dynamic_float32():
+    add_dynamic(np.float32)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_dynamic_float16():
+    add_dynamic(np.float16)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_dynamic_int64():
+    add_dynamic(np.int64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_add_dynamic_int32():
+    add_dynamic(np.int32)
diff --git a/tests/st/ops/gpu/test_div_op.py b/tests/st/ops/gpu/test_div_op.py
index 03438e9305b..546bb5a9499 100644
--- a/tests/st/ops/gpu/test_div_op.py
+++ b/tests/st/ops/gpu/test_div_op.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
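The div test below uses the same comparison pattern as the other files: compute the reference with NumPy and require every element to fall within a fixed 1e-5 band. In isolation the pattern looks like this (a standalone sketch; note the tests compare the signed difference, whereas np.abs, shown here, would also flag results that undershoot):

    import numpy as np

    output = np.array([0.25, 0.5], dtype=np.float64)  # stand-in for output.asnumpy()
    expect = np.divide(np.array([1.0, 2.0]), np.array([4.0, 4.0]))
    error = np.ones(shape=expect.shape) * 1.0e-5
    assert np.all(np.abs(output - expect) < error)
    assert output.shape == expect.shape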
@@ -29,24 +29,17 @@ class NetDiv(nn.Cell):
     def construct(self, x, y):
         return self.div(x, y)
 
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_training
-@pytest.mark.env_onecard
-def test_div():
-    x0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
-    y0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
-    x1_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
-    y1_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.float32)
-    x2_np = np.random.randint(1, 5, (2, 1, 1, 4)).astype(np.float32)
-    y2_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
-    x3_np = np.random.randint(1, 5, 1).astype(np.float32)
-    y3_np = np.random.randint(1, 5, 1).astype(np.float32)
-    x4_np = np.array(768).astype(np.float32)
-    y4_np = np.array(3072.5).astype(np.float32)
-    x5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
-    y5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
-    x6_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.int32)
-    y6_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.int32)
+def div(nptype):
+    x0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(nptype)
+    y0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(nptype)
+    x1_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(nptype)
+    y1_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(nptype)
+    x2_np = np.random.randint(1, 5, (2, 1, 1, 4)).astype(nptype)
+    y2_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(nptype)
+    x3_np = np.random.randint(1, 5, 1).astype(nptype)
+    y3_np = np.random.randint(1, 5, 1).astype(nptype)
+    x4_np = np.array(78).astype(nptype)
+    y4_np = np.array(37.5).astype(nptype)
 
     x0 = Tensor(x0_np)
     y0 = Tensor(y0_np)
@@ -58,28 +51,24 @@
     y3 = Tensor(y3_np)
     x4 = Tensor(x4_np)
     y4 = Tensor(y4_np)
-    x5 = Tensor(x5_np)
-    y5 = Tensor(y5_np)
-    x6 = Tensor(x6_np)
-    y6 = Tensor(y6_np)
 
     context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
-    div = NetDiv()
-    output0 = div(x0, y0)
+    div_net = NetDiv()
+    output0 = div_net(x0, y0)
     expect0 = np.divide(x0_np, y0_np)
     diff0 = output0.asnumpy() - expect0
     error0 = np.ones(shape=expect0.shape) * 1.0e-5
     assert np.all(diff0 < error0)
     assert output0.shape == expect0.shape
 
-    output1 = div(x1, y1)
+    output1 = div_net(x1, y1)
     expect1 = np.divide(x1_np, y1_np)
     diff1 = output1.asnumpy() - expect1
     error1 = np.ones(shape=expect1.shape) * 1.0e-5
     assert np.all(diff1 < error1)
     assert output1.shape == expect1.shape
 
-    output2 = div(x2, y2)
+    output2 = div_net(x2, y2)
     expect2 = np.divide(x2_np, y2_np)
     diff2 = output2.asnumpy() - expect2
     error2 = np.ones(shape=expect2.shape) * 1.0e-5
@@ -87,30 +76,46 @@
     assert output2.shape == expect2.shape
 
     context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
-    output3 = div(x3, y3)
+    output3 = div_net(x3, y3)
     expect3 = np.divide(x3_np, y3_np)
     diff3 = output3.asnumpy() - expect3
     error3 = np.ones(shape=expect3.shape) * 1.0e-5
     assert np.all(diff3 < error3)
     assert output3.shape == expect3.shape
 
-    output4 = div(x4, y4)
+    output4 = div_net(x4, y4)
     expect4 = np.divide(x4_np, y4_np)
     diff4 = output4.asnumpy() - expect4
     error4 = np.ones(shape=expect4.shape) * 1.0e-5
     assert np.all(diff4 < error4)
     assert output4.shape == expect4.shape
 
-    output5 = div(x5, y5)
-    expect5 = np.divide(x5_np, y5_np)
-    diff5 = output5.asnumpy() - expect5
-    error5 = np.ones(shape=expect5.shape) * 1.0e-5
-    assert np.all(diff5 < error5)
-    assert output5.shape == expect5.shape
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_div_float64():
+    div(np.float64)
 
-    output6 = div(x6, y6)
-    expect6 = np.divide(x6_np, y6_np)
-    diff6 = output6.asnumpy() - expect6
-    error6 = np.ones(shape=expect6.shape) * 1.0e-5
-    assert np.all(diff6 < error6)
-    assert output6.shape == expect6.shape
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_div_float32():
+    div(np.float32)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_div_float16():
+    div(np.float16)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_div_int64():
+    div(np.int64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_div_int32():
+    div(np.int32)
diff --git a/tests/st/ops/gpu/test_gpu_convert_to_dynamic_shape_op.py b/tests/st/ops/gpu/test_gpu_convert_to_dynamic_shape_op.py
index a89d0de851d..9738edf14a5 100644
--- a/tests/st/ops/gpu/test_gpu_convert_to_dynamic_shape_op.py
+++ b/tests/st/ops/gpu/test_gpu_convert_to_dynamic_shape_op.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -63,6 +63,12 @@ def gpu_convert_to_dynamic_shape_float(dtype):
     np.random.seed(0)
     finfo = np.finfo(dtype)
+
+    # np.random.uniform will overflow if we use min/max for float64, so we use
+    # the finfo for float32, but still test the operator with float64 input.
+    if dtype == np.float64:
+        finfo = np.finfo(np.float32)
+
     float_min = finfo.min
     float_max = finfo.max
     x = np.random.uniform(low=float_min, high=float_max, size=12).astype(dtype)
@@ -103,6 +109,12 @@ def test_gpu_convert_to_dynamic_shape_float16():
 def test_gpu_convert_to_dynamic_shape_float32():
     gpu_convert_to_dynamic_shape_float(np.float32)
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_gpu_convert_to_dynamic_shape_float64():
+    gpu_convert_to_dynamic_shape_float(np.float64)
+
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
diff --git a/tests/st/ops/gpu/test_mul_op.py b/tests/st/ops/gpu/test_mul_op.py
index 884138cc916..5a084b01d12 100644
--- a/tests/st/ops/gpu/test_mul_op.py
+++ b/tests/st/ops/gpu/test_mul_op.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Huawei Technologies Co., Ltd
+# Copyright 2019-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
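The clamp added to test_gpu_convert_to_dynamic_shape_op.py above exists because np.random.uniform draws low + (high - low) * u, and over the full float64 range the scale high - low already overflows. The effect is easy to reproduce (a standalone NumPy sketch):

    import numpy as np

    f64 = np.finfo(np.float64)
    print(f64.max - f64.min)  # inf: the internal (high - low) scale overflows

    # Clamping to float32 limits keeps the arithmetic finite while still
    # feeding the operator genuine float64 data, as the test above does.
    f32 = np.finfo(np.float32)
    x = np.random.uniform(low=f32.min, high=f32.max, size=12).astype(np.float64)
    print(x.dtype)  # float64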
@@ -31,20 +31,17 @@ class NetMul(nn.Cell):
         return self.mul(x, y)
 
 
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_training
-@pytest.mark.env_onecard
-def test_mul():
-    x0_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
-    y0_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
-    x1_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
-    y1_np = np.random.uniform(-2, 2, (2, 1, 4, 4)).astype(np.float32)
-    x2_np = np.random.uniform(-2, 2, (2, 1, 1, 4)).astype(np.float32)
-    y2_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
-    x3_np = np.random.uniform(-2, 2, 1).astype(np.float32)
-    y3_np = np.random.uniform(-2, 2, 1).astype(np.float32)
-    x4_np = np.array(768).astype(np.float32)
-    y4_np = np.array(3072.5).astype(np.float32)
+def mul(nptype):
+    x0_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(nptype)
+    y0_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(nptype)
+    x1_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(nptype)
+    y1_np = np.random.uniform(-2, 2, (2, 1, 4, 4)).astype(nptype)
+    x2_np = np.random.uniform(-2, 2, (2, 1, 1, 4)).astype(nptype)
+    y2_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(nptype)
+    x3_np = np.random.uniform(-2, 2, 1).astype(nptype)
+    y3_np = np.random.uniform(-2, 2, 1).astype(nptype)
+    x4_np = np.array(78).astype(nptype)
+    y4_np = np.array(37.5).astype(nptype)
 
     x0 = Tensor(x0_np)
     y0 = Tensor(y0_np)
@@ -58,36 +55,36 @@
     y4 = Tensor(y4_np)
 
     context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
-    mul = NetMul()
-    output0 = mul(x0, y0)
+    mul_net = NetMul()
+    output0 = mul_net(x0, y0)
     expect0 = np.multiply(x0_np, y0_np)
     diff0 = output0.asnumpy() - expect0
     error0 = np.ones(shape=expect0.shape) * 1.0e-5
     assert np.all(diff0 < error0)
     assert output0.shape == expect0.shape
 
-    output1 = mul(x1, y1)
+    output1 = mul_net(x1, y1)
     expect1 = np.multiply(x1_np, y1_np)
     diff1 = output1.asnumpy() - expect1
     error1 = np.ones(shape=expect1.shape) * 1.0e-5
     assert np.all(diff1 < error1)
     assert output1.shape == expect1.shape
 
-    output2 = mul(x2, y2)
+    output2 = mul_net(x2, y2)
     expect2 = np.multiply(x2_np, y2_np)
     diff2 = output2.asnumpy() - expect2
     error2 = np.ones(shape=expect2.shape) * 1.0e-5
     assert np.all(diff2 < error2)
     assert output2.shape == expect2.shape
 
-    output3 = mul(x3, y3)
+    output3 = mul_net(x3, y3)
     expect3 = np.multiply(x3_np, y3_np)
     diff3 = output3.asnumpy() - expect3
     error3 = np.ones(shape=expect3.shape) * 1.0e-5
     assert np.all(diff3 < error3)
     assert output3.shape == expect3.shape
 
-    output4 = mul(x4, y4)
+    output4 = mul_net(x4, y4)
     expect4 = np.multiply(x4_np, y4_np)
     diff4 = output4.asnumpy() - expect4
     error4 = np.ones(shape=expect4.shape) * 1.0e-5
@@ -95,42 +92,72 @@
     assert output4.shape == expect4.shape
 
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    mul = NetMul()
-    output0 = mul(x0, y0)
+    mul_net = NetMul()
+    output0 = mul_net(x0, y0)
     expect0 = np.multiply(x0_np, y0_np)
     diff0 = output0.asnumpy() - expect0
     error0 = np.ones(shape=expect0.shape) * 1.0e-5
     assert np.all(diff0 < error0)
     assert output0.shape == expect0.shape
 
-    output1 = mul(x1, y1)
+    output1 = mul_net(x1, y1)
     expect1 = np.multiply(x1_np, y1_np)
     diff1 = output1.asnumpy() - expect1
     error1 = np.ones(shape=expect1.shape) * 1.0e-5
     assert np.all(diff1 < error1)
     assert output1.shape == expect1.shape
 
-    output2 = mul(x2, y2)
+    output2 = mul_net(x2, y2)
     expect2 = np.multiply(x2_np, y2_np)
     diff2 = output2.asnumpy() - expect2
     error2 = np.ones(shape=expect2.shape) * 1.0e-5
     assert np.all(diff2 < error2)
     assert output2.shape == expect2.shape
 
-    output3 = mul(x3, y3)
+    output3 = mul_net(x3, y3)
     expect3 = np.multiply(x3_np, y3_np)
     diff3 = output3.asnumpy() - expect3
     error3 = np.ones(shape=expect3.shape) * 1.0e-5
     assert np.all(diff3 < error3)
     assert output3.shape == expect3.shape
 
-    output4 = mul(x4, y4)
+    output4 = mul_net(x4, y4)
     expect4 = np.multiply(x4_np, y4_np)
     diff4 = output4.asnumpy() - expect4
     error4 = np.ones(shape=expect4.shape) * 1.0e-5
     assert np.all(diff4 < error4)
     assert output4.shape == expect4.shape
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_float64():
+    mul(np.float64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_float32():
+    mul(np.float32)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_float16():
+    mul(np.float16)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_int64():
+    mul(np.int64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_int32():
+    mul(np.int32)
+
 class NetMul_dynamic(nn.Cell):
     def __init__(self):
         super(NetMul_dynamic, self).__init__()
@@ -143,14 +170,12 @@ class NetMul_dynamic(nn.Cell):
         out = self.mul(x, y)
         return out
 
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_training
-@pytest.mark.env_onecard
-def test_mul_dynamic():
-    x1_np = np.array([768]).astype(np.float32)
-    y1_np = np.array([3072.5]).astype(np.float32)
-    x2_np = np.random.uniform(-2, 2, (2, 1, 1, 4)).astype(np.float32)
-    y2_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
+
+def mul_dynamic(nptype):
+    x1_np = np.array([78]).astype(nptype)
+    y1_np = np.array([37.5]).astype(nptype)
+    x2_np = np.random.uniform(-2, 2, (2, 1, 1, 4)).astype(nptype)
+    y2_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(nptype)
 
     x1 = Tensor(x1_np)
     y1 = Tensor(y1_np)
@@ -159,10 +184,10 @@
 
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
 
-    mul = NetMul_dynamic()
+    mul_net = NetMul_dynamic()
 
-    output1 = mul(x1, y1)
-    output2 = mul(x2, y2)
+    output1 = mul_net(x1, y1)
+    output2 = mul_net(x2, y2)
     expect1 = np.multiply(x1_np, y1_np)
     expect2 = np.multiply(x2_np, y2_np)
     diff1 = output1.asnumpy() - expect1
@@ -173,3 +198,33 @@
     error2 = np.ones(shape=expect2.shape) * 1.0e-5
     assert np.all(diff2 < error2)
     assert output2.shape == expect2.shape
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_dynamic_float64():
+    mul_dynamic(np.float64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_dynamic_float32():
+    mul_dynamic(np.float32)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_dynamic_float16():
+    mul_dynamic(np.float16)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_dynamic_int64():
+    mul_dynamic(np.int64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mul_dynamic_int32():
+    mul_dynamic(np.int32)
diff --git a/tests/st/ops/gpu/test_sub_op.py b/tests/st/ops/gpu/test_sub_op.py
index 3ba03c6586f..1947d8e668f 100644
--- a/tests/st/ops/gpu/test_sub_op.py
+++ b/tests/st/ops/gpu/test_sub_op.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Huawei Technologies Co., Ltd
+# Copyright 2019-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,20 +31,17 @@ class Net(nn.Cell):
         return self.sub(x, y)
 
 
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_training
-@pytest.mark.env_onecard
-def test_Sub():
-    np_x0 = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
-    np_y0 = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
-    np_x1 = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
-    np_y1 = np.random.uniform(-2, 2, (2, 1, 4, 4)).astype(np.float32)
-    np_x2 = np.random.uniform(-2, 2, (2, 1, 1, 4)).astype(np.float32)
-    np_y2 = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
-    np_x3 = np.random.uniform(-2, 2, 1).astype(np.float32)
-    np_y3 = np.random.uniform(-2, 2, 1).astype(np.float32)
-    np_x4 = np.array(768).astype(np.float32)
-    np_y4 = np.array(3072.5).astype(np.float32)
+def sub(nptype):
+    np_x0 = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(nptype)
+    np_y0 = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(nptype)
+    np_x1 = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(nptype)
+    np_y1 = np.random.uniform(-2, 2, (2, 1, 4, 4)).astype(nptype)
+    np_x2 = np.random.uniform(-2, 2, (2, 1, 1, 4)).astype(nptype)
+    np_y2 = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(nptype)
+    np_x3 = np.random.uniform(-2, 2, 1).astype(nptype)
+    np_y3 = np.random.uniform(-2, 2, 1).astype(nptype)
+    np_x4 = np.array(768).astype(nptype)
+    np_y4 = np.array(3072.5).astype(nptype)
     x0 = Tensor(np_x0)
     y0 = Tensor(np_y0)
     x1 = Tensor(np_x1)
@@ -68,12 +65,12 @@
     error4 = np.ones(shape=expect4.shape) * 1.0e-5
 
     context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
-    sub = Net()
-    output0 = sub(x0, y0)
-    output1 = sub(x1, y1)
-    output2 = sub(x2, y2)
-    output3 = sub(x3, y3)
-    output4 = sub(x4, y4)
+    sub_net = Net()
+    output0 = sub_net(x0, y0)
+    output1 = sub_net(x1, y1)
+    output2 = sub_net(x2, y2)
+    output3 = sub_net(x3, y3)
+    output4 = sub_net(x4, y4)
     diff0 = output0.asnumpy() - expect0
     assert np.all(diff0 < error0)
     assert output0.shape == expect0.shape
@@ -91,12 +88,12 @@
     assert output4.shape == expect4.shape
 
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    sub = Net()
-    output0 = sub(x0, y0)
-    output1 = sub(x1, y1)
-    output2 = sub(x2, y2)
-    output3 = sub(x3, y3)
-    output4 = sub(x4, y4)
+    sub_net = Net()
+    output0 = sub_net(x0, y0)
+    output1 = sub_net(x1, y1)
+    output2 = sub_net(x2, y2)
+    output3 = sub_net(x3, y3)
+    output4 = sub_net(x4, y4)
     diff0 = output0.asnumpy() - expect0
     assert np.all(diff0 < error0)
     assert output0.shape == expect0.shape
@@ -112,3 +109,33 @@
     diff4 = output4.asnumpy() - expect4
     assert np.all(diff4 < error4)
     assert output4.shape == expect4.shape
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_sub_float64():
+    sub(np.float64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_sub_float32():
+    sub(np.float32)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_sub_float16():
+    sub(np.float16)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_sub_int64():
+    sub(np.int64)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_sub_int32():
+    sub(np.int32)
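Like the mul and div tests, the sub tests run the same network in both PYNATIVE_MODE and GRAPH_MODE. The pattern, reduced to its core (a hypothetical minimal example using the new float64 support):

    import numpy as np
    import mindspore.context as context
    from mindspore import Tensor
    from mindspore.ops import operations as P

    x = Tensor(np.array([3.0], dtype=np.float64))
    y = Tensor(np.array([1.5], dtype=np.float64))

    # Each mode exercises a different execution path over the same fp64 kernel.
    for mode in (context.PYNATIVE_MODE, context.GRAPH_MODE):
        context.set_context(mode=mode, device_target="GPU")
        print(P.Sub()(x, y))  # [1.5] in both modes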