diff --git a/mindspore/ccsrc/pipeline/jit/init.cc b/mindspore/ccsrc/pipeline/jit/init.cc
index be3a8881b27..1622adfc36c 100644
--- a/mindspore/ccsrc/pipeline/jit/init.cc
+++ b/mindspore/ccsrc/pipeline/jit/init.cc
@@ -26,6 +26,7 @@
 #include "utils/summary/event_writer.h"
 #include "utils/config_manager.h"
 #include "utils/mpi/mpi_config.h"
+#include "utils/ms_utils.h"
 #include "frontend/parallel/context.h"
 #include "frontend/parallel/costmodel_context.h"
 #ifdef ENABLE_GPU_COLLECTIVE
@@ -57,6 +58,9 @@ using PSContext = mindspore::ps::PSContext;
 
 // Interface with python
 PYBIND11_MODULE(_c_expression, m) {
+  // Setting OMP_NUM_THREADS in the backend has no effect, so set it here in advance.
+  mindspore::common::SetOMPThreadNum();
+
   m.doc() = "MindSpore c plugin";
 
   auto fns = mindspore::PybindDefineRegister::AllFuncs();
diff --git a/mindspore/ccsrc/vm/transform.cc b/mindspore/ccsrc/vm/transform.cc
index 38241fa17cc..7eb82a04650 100644
--- a/mindspore/ccsrc/vm/transform.cc
+++ b/mindspore/ccsrc/vm/transform.cc
@@ -567,9 +567,14 @@ void SetMindRTEnable() {
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
   std::string target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
-  if (target != kGPUDevice) {
+  if ((target != kGPUDevice) && (target != kCPUDevice)) {
     return;
   }
+
+#if defined(_WIN32) || defined(_WIN64)
+  return;
+#endif
+
 #if (ENABLE_CPU && !_WIN32)
   if (ps::PSContext::instance()->is_ps_mode()) {
     return;
diff --git a/mindspore/core/utils/ms_utils.h b/mindspore/core/utils/ms_utils.h
index cccfb117f22..0982584a190 100644
--- a/mindspore/core/utils/ms_utils.h
+++ b/mindspore/core/utils/ms_utils.h
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <thread>
 
 #define DISABLE_COPY_AND_ASSIGN(ClassType) \
   ClassType(const ClassType &) = delete;   \
@@ -48,6 +49,19 @@ static inline int SetEnv(const char *envname, const char *envvar, int overwrite
   return ::setenv(envname, envvar, overwrite);
 #endif
 }
+
+static inline void SetOMPThreadNum() {
+  size_t cpu_core_num = std::thread::hardware_concurrency();
+  size_t cpu_core_num_half = cpu_core_num / 2;
+  const size_t kOMPThreadMaxNum = 16;
+  const size_t kOMPThreadMinNum = 1;
+
+  size_t OMP_thread_num = cpu_core_num_half < kOMPThreadMinNum ? kOMPThreadMinNum : cpu_core_num_half;
+  OMP_thread_num = OMP_thread_num > kOMPThreadMaxNum ? kOMPThreadMaxNum : OMP_thread_num;
+
+  std::string OMP_env = std::to_string(OMP_thread_num);
+  SetEnv("OMP_NUM_THREADS", OMP_env.c_str(), 0);
+}
 }  // namespace common
 }  // namespace mindspore
diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py
index 6c6dae8c014..1da98e73942 100644
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@@ -87,10 +87,11 @@ def run_e2e_dump():
     add = Net()
     add(Tensor(x), Tensor(y))
     time.sleep(5)
-    assert len(os.listdir(dump_file_path)) == 5
     if context.get_context("device_target") == "Ascend":
+        assert len(os.listdir(dump_file_path)) == 5
         output_name = "Add.Add-op1.0.0.*.output.0.DefaultFormat.npy"
     else:
+        assert len(os.listdir(dump_file_path)) == 3
         output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy"
     output_path = glob.glob(dump_file_path + output_name)[0]
     real_path = os.path.realpath(output_path)
@@ -116,6 +117,13 @@ def test_cpu_e2e_dump():
     context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
     run_e2e_dump()
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_gpu_e2e_dump():
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    run_e2e_dump()
+
 class ReluReduceMeanDenseRelu(Cell):
     def __init__(self, kernel, bias, in_channel, num_class):
         super().__init__()
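For reviewers, the rule implemented by `SetOMPThreadNum()` is: take half of `std::thread::hardware_concurrency()`, clamp it to the range [1, 16], and export the result as `OMP_NUM_THREADS`. Below is a minimal standalone sketch of that clamping rule, assuming C++17 for `std::clamp`; the helper name `ClampedOMPThreadNum` is hypothetical, introduced only for illustration, and is not part of the patch or of MindSpore's API.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <thread>

// Sketch of the rule in SetOMPThreadNum(): half the hardware threads,
// clamped to [1, 16]. Equivalent to the two ternaries in the patch.
static size_t ClampedOMPThreadNum(size_t cpu_core_num) {
  const size_t kOMPThreadMaxNum = 16;
  const size_t kOMPThreadMinNum = 1;
  return std::clamp(cpu_core_num / 2, kOMPThreadMinNum, kOMPThreadMaxNum);
}

int main() {
  // hardware_concurrency() may return 0; the clamp still yields 1 in that case.
  for (size_t cores : {0, 1, 8, 32, 128}) {
    std::cout << cores << " cores -> " << ClampedOMPThreadNum(cores)
              << " OMP threads\n";
  }
  std::cout << "this machine -> "
            << ClampedOMPThreadNum(std::thread::hardware_concurrency())
            << " OMP threads\n";
  return 0;
}
```

Note that the patch calls `SetEnv("OMP_NUM_THREADS", OMP_env.c_str(), 0)` with `overwrite = 0`, which on POSIX maps to `setenv(..., 0)` and leaves an already-exported `OMP_NUM_THREADS` untouched; the patch supplies a default rather than a hard override.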