forked from mindspore-Ecosystem/mindspore
refactor data queue
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
This commit is contained in:
parent
4da277798e
commit
e0c7fa6136
@ -81,7 +81,6 @@ if(ENABLE_D)
endif()

if(ENABLE_GPU)
set(ENABLE_GPUQUE ON)
add_compile_definitions(ENABLE_GPU_COLLECTIVE)
endif()
@ -545,7 +545,7 @@ MindRTBackend::MindRTBackend(const std::string &backend_name, const std::string
root_graph_ = nullptr;
auto ms_context = MsContext::GetInstance();
const bool pynative_mode = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode);
auto &cut_list = pynative_mode ? compile::control_ops : GetMsNonlinearOps();
auto &cut_list = pynative_mode ? GetControlOps() : GetMsNonlinearOps();

graph_partition_ = std::make_shared<GraphPartition>(cut_list, backend_name);
graph_compiler_ = std::make_shared<GraphCompiler>();
@ -40,11 +40,17 @@ using PrimTypePair = std::pair<PrimitivePtr, AbstractFunctionPtr>;
using MapPrimTypeFuncGraph = std::map<PrimTypePair, FuncGraphPtr>;
using TypedPrimitiveAbstractClosurePtr = std::shared_ptr<abstract::TypedPrimitiveAbstractClosure>;

std::vector<PrimitivePtr> nonlinear_ops = {prim::kPrimReturn, prim::kPrimPartial, prim::kPrimSwitch,
prim::kPrimMakeTuple, prim::kPrimBpropCut};
const std::vector<PrimitivePtr> &GetNonlinearOps() {
static std::vector<PrimitivePtr> nonlinear_ops = {prim::kPrimReturn, prim::kPrimPartial, prim::kPrimSwitch,
prim::kPrimMakeTuple, prim::kPrimBpropCut};
return nonlinear_ops;
}

std::vector<PrimitivePtr> control_ops = {prim::kPrimReturn, prim::kPrimPartial, prim::kPrimSwitch, prim::kPrimMakeTuple,
prim::kPrimSwitchLayer};
const std::vector<PrimitivePtr> &GetControlOps() {
static std::vector<PrimitivePtr> control_ops = {prim::kPrimReturn, prim::kPrimPartial, prim::kPrimSwitch,
prim::kPrimMakeTuple, prim::kPrimSwitchLayer};
return control_ops;
}

const std::vector<PrimitivePtr> &GetMsNonlinearOps() {
static const std::vector<PrimitivePtr> ms_nonlinear_ops = {prim::kPrimReturn, prim::kPrimPartial,
@ -43,8 +43,8 @@ extern const char kGeVm[];
// compile namespace
// A sub namespace in ME to support compile related definition.
namespace compile {
BACKEND_EXPORT extern std::vector<PrimitivePtr> nonlinear_ops;
BACKEND_EXPORT extern std::vector<PrimitivePtr> control_ops;
BACKEND_EXPORT const std::vector<PrimitivePtr> &GetNonlinearOps();
BACKEND_EXPORT const std::vector<PrimitivePtr> &GetControlOps();
BACKEND_EXPORT const std::vector<PrimitivePtr> &GetMsNonlinearOps();
FuncGraphPtr WrapPrimitives(const FuncGraphPtr &graph);
using VmEvalFunc = std::function<BaseRef(const VectorRef &)>;

@ -52,7 +52,7 @@ using VmEvalFuncPtr = std::shared_ptr<std::function<BaseRef(const VectorRef &)>>

class BACKEND_EXPORT CompileGraph {
public:
explicit CompileGraph(const BackendPtr &backend, const std::vector<PrimitivePtr> &cut_list = nonlinear_ops);
explicit CompileGraph(const BackendPtr &backend, const std::vector<PrimitivePtr> &cut_list = GetNonlinearOps());

virtual ~CompileGraph() = default;

@ -113,7 +113,7 @@ using CompileGraphPtr = std::shared_ptr<CompileGraph>;
// CompileGraphs is used to Convert a graph cluster into instruction lists.
class BACKEND_EXPORT CompileGraphs {
public:
explicit CompileGraphs(const BackendPtr &backend, const std::vector<PrimitivePtr> &cut_list = nonlinear_ops);
explicit CompileGraphs(const BackendPtr &backend, const std::vector<PrimitivePtr> &cut_list = GetNonlinearOps());

virtual ~CompileGraphs() = default;
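The hunks above replace the exported global vectors (nonlinear_ops, control_ops) with accessor functions that return a function-local static, presumably to avoid static-initialization-order problems across the BACKEND_EXPORT library boundary. A minimal self-contained sketch of that pattern, with hypothetical names rather than MindSpore code:

#include <iostream>
#include <string>
#include <vector>

// A function-local static is constructed on first call, so callers in other
// translation units (or across a shared-library boundary) never observe a
// not-yet-initialized global object.
const std::vector<std::string> &GetDefaultOps() {
  static const std::vector<std::string> ops = {"Return", "Partial", "Switch"};
  return ops;
}

int main() {
  std::cout << GetDefaultOps().size() << std::endl;  // prints 3
  return 0;
}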
@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

@ -14,8 +14,8 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_BLOCKING_QUEUE_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_BLOCKING_QUEUE_H_
#ifndef MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_BLOCKING_QUEUE_H
#define MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_BLOCKING_QUEUE_H

#include <unistd.h>
#include <iostream>

@ -25,7 +25,7 @@
#include <vector>
#include <condition_variable>
#include <functional>
#include "runtime/data_queue/data_queue.h"
#include "include/backend/data_queue/data_queue.h"
namespace mindspore {
namespace device {
class BlockingQueue {

@ -33,15 +33,16 @@ class BlockingQueue {
BlockingQueue() : queue_(nullptr) {}
~BlockingQueue() = default;

BlockQueueStatus_T Create(const std::shared_ptr<DataQueue> &data_queue);
DataQueueStatus Create(const std::shared_ptr<DataQueue> &data_queue);
void RegisterRelease(const std::function<void(void *, int32_t)> &func);
BlockQueueStatus_T Push(const std::vector<DataQueueItem> &data, unsigned int timeout_in_sec);
BlockQueueStatus_T Front(std::vector<DataQueueItem> *data);
BlockQueueStatus_T Pop();
BlockQueueStatus_T Clear();
DataQueueStatus Push(const std::vector<DataQueueItem> &data, unsigned int timeout_in_sec);
DataQueueStatus Front(std::vector<DataQueueItem> *data);
DataQueueStatus Pop();
DataQueueStatus Clear();
bool Destroy();
size_t Size() { return queue_->Size(); }
size_t Capacity() { return queue_->Capacity(); }
const std::shared_ptr<DataQueue> &Queue() const { return queue_; }

private:
std::mutex mutex_;

@ -51,5 +52,4 @@ class BlockingQueue {
};
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_BLOCKING_QUEUE_H_
#endif // MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_BLOCKING_QUEUE_H
@ -14,19 +14,30 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_DATA_QUEUE_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_DATA_QUEUE_H_
#ifndef MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_H
#define MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_H

#include <unistd.h>
#include <string>
#include <memory>
#include <vector>
#include <functional>
#include "runtime/hardware/device_context_manager.h"
#include "mindspore/core/utils/data_queue_handler.h"

namespace mindspore {
namespace device {
class DeviceContext;

enum class DataQueueStatus : int { SUCCESS = 0, QUEUE_EXIST, QUEUE_NOT_EXIST, ERROR_INPUT, INTERNAL_ERROR, TIMEOUT };

struct DataQueueItem {
int32_t worker_id_{0};
std::string data_type_;
size_t data_len_{0};
void *data_ptr_{nullptr};
std::vector<int64_t> shapes_;
void *device_addr_{nullptr};
};

class DataQueue {
public:
explicit DataQueue(const size_t capacity);

@ -34,16 +45,21 @@ class DataQueue {

virtual void RegisterRelease(const std::function<void(void *, int32_t)> &func) { host_release_ = func; }

virtual bool IsOpen() const { return true; }
virtual bool IsEmpty() const { return size_ == 0; }
virtual bool IsFull() const { return size_ == capacity_; }

virtual BlockQueueStatus_T Push(std::vector<DataQueueItem> data) = 0;
virtual BlockQueueStatus_T Front(std::vector<DataQueueItem> *data) const = 0;
virtual BlockQueueStatus_T Pop() = 0;
virtual DataQueueStatus Push(std::vector<DataQueueItem> data) = 0;
virtual DataQueueStatus Front(std::vector<DataQueueItem> *data) const = 0;
virtual DataQueueStatus Pop() = 0;
virtual bool Destroy() = 0;
virtual void SetThreadDevice() = 0;

virtual size_t Size() { return size_; }
virtual size_t Capacity() { return capacity_; }

virtual std::shared_ptr<void> AllocHostMem(size_t size) { return std::shared_ptr<void>(::malloc(size), ::free); }

protected:
size_t head_;
size_t tail_;

@ -59,5 +75,4 @@ class DataQueue {
};
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_BLOCKING_QUEUE_H_
#endif // MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_H
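As a reading aid only: the head_/tail_/size_/capacity_ members and the DataQueueStatus results above suggest a fixed-capacity ring buffer underneath the Push/Front/Pop contract. A self-contained toy sketch of that shape (ints instead of DataQueueItem, no blocking, purely illustrative and not the device-specific implementation in this commit):

#include <cstddef>
#include <vector>

enum class DataQueueStatus : int { SUCCESS = 0, QUEUE_EXIST, QUEUE_NOT_EXIST, ERROR_INPUT, INTERNAL_ERROR, TIMEOUT };

class RingQueue {
 public:
  explicit RingQueue(size_t capacity) : head_(0), tail_(0), size_(0), capacity_(capacity), buf_(capacity) {}

  DataQueueStatus Push(int item) {
    if (size_ == capacity_) {
      return DataQueueStatus::TIMEOUT;  // a real queue would block until a slot frees up
    }
    buf_[tail_] = item;
    tail_ = (tail_ + 1) % capacity_;
    ++size_;
    return DataQueueStatus::SUCCESS;
  }

  DataQueueStatus Front(int *item) const {
    if (size_ == 0) {
      return DataQueueStatus::INTERNAL_ERROR;
    }
    *item = buf_[head_];
    return DataQueueStatus::SUCCESS;
  }

  DataQueueStatus Pop() {
    if (size_ == 0) {
      return DataQueueStatus::INTERNAL_ERROR;
    }
    head_ = (head_ + 1) % capacity_;
    --size_;
    return DataQueueStatus::SUCCESS;
  }

 private:
  size_t head_, tail_, size_, capacity_;
  std::vector<int> buf_;
};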
@ -0,0 +1,156 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_MGR_H
#define MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_MGR_H

#include <unistd.h>
#include <iostream>
#include <functional>
#include <map>
#include <vector>
#include <string>
#include <memory>
#include <mutex>
#include <condition_variable>
#include "include/backend/visible.h"
#include "include/backend/data_queue/data_queue.h"
#ifndef BUILD_LITE
#include "ir/anf.h"
#endif

namespace mindspore {
namespace device {
constexpr unsigned int MAX_WAIT_TIME_IN_SEC = 60;

class BlockingQueue;

// channel_name, dynamic_shape, capacity, addr, shape
using DataQueueCreator =
std::function<std::shared_ptr<DataQueue>(const std::string &, bool, size_t, void *, const std::vector<size_t> &)>;

class Semaphore {
public:
explicit Semaphore(int count = 0) : count_(count) {}

inline void Signal() {
std::unique_lock<std::mutex> lock(mutex_);
++count_;
cv_.notify_one();
}

inline bool Wait() {
std::unique_lock<std::mutex> lock(mutex_);
while (count_ == 0) {
if (cv_.wait_for(lock, std::chrono::seconds(MAX_WAIT_TIME_IN_SEC)) == std::cv_status::timeout) {
return false;
}
}
--count_;
return true;
}

private:
std::mutex mutex_;
std::condition_variable cv_;
int count_;
};

class BACKEND_EXPORT DataQueueMgr {
public:
DataQueueMgr() : init_(false), closed_(false), open_by_dataset_(0) {}

virtual ~DataQueueMgr() = default;

static DataQueueMgr &GetInstance() noexcept;
void RegisterDataQueueCreator(const std::string &device_name, DataQueueCreator &&creator);
std::shared_ptr<DataQueue> CreateDataQueue(const std::string &device_name, const std::string &channel_name,
bool dynamic_shape, size_t capacity, void *addr,
const std::vector<size_t> &shape);

DataQueueStatus Create(const std::string &channel_name, void *addr, const std::vector<size_t> &shape,
const size_t &capacity);

// call for Push thread
DataQueueStatus Open(const std::string &channel_name, std::function<void(void *, int32_t)> func);

// call for Front/Pop thread
DataQueueStatus Open(const std::string &channel_name) const;
DataQueueStatus Push(const std::string &channel_name, const std::vector<DataQueueItem> &data,
unsigned int timeout_in_sec);
DataQueueStatus Front(const std::string &channel_name, std::vector<DataQueueItem> *data);
DataQueueStatus Pop(const std::string &channel_name);
DataQueueStatus Clear(const std::string &channel_name);

DataQueueStatus OpenDynamicBufQueue(const std::string &channel_name, const std::function<void(void *, int32_t)> func);
DataQueueStatus CreateDynamicBufQueue(const std::string &channel_name, const size_t &capacity);
DataQueueStatus OpenDynamicBufQueue(const std::string &channel_name);
std::shared_ptr<DataQueue> GetDataQueue(const std::string &channel_name) const;
DataQueueStatus SetThreadDevice(const std::string &device_name);
std::shared_ptr<void> AllocHostMem(const std::string &device_name, size_t size);

void Close(const std::string &channel_name) const noexcept;

bool IsInit() const;

bool IsClosed() const;

bool Destroy();

// call for Release GPU Resources
bool CloseNotify();

// call for dataset send thread
void CloseConfirm();

size_t Size(const std::string &channel_name);

size_t Capacity(const std::string &channel_name);

private:
inline bool isCreated(const std::string &channel_name) const;

DataQueueMgr(const DataQueueMgr &) = delete;
DataQueueMgr &operator=(const DataQueueMgr &) = delete;

bool init_;
bool closed_;
std::mutex mutex_;
std::mutex close_mutex_;
std::condition_variable cv_;
// how many queues opened by dataset
int open_by_dataset_;
Semaphore sema;
bool dynamic_shape_{false};
size_t default_capacity_{2};

std::map<std::string, std::shared_ptr<BlockingQueue>> name_queue_map_;

std::map<std::string, DataQueueCreator> data_queue_creator_map_ = {}; // key: device name, value: DataQueueCreator
};
#ifndef BUILD_LITE
bool PopDataFromDataQueue(const AnfNodePtr &data_kernel);
#endif
#define REGISTER_DATA_QUEUE_CREATOR(device_name, creator) \
struct device_name##DataQueueCreatorClass { \
device_name##DataQueueCreatorClass() { \
DataQueueMgr::GetInstance().RegisterDataQueueCreator(device_name, creator); \
} \
} g_##device_name##_data_queue_creator;
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_MGR_H
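For orientation, the producer side of this manager is driven the way the dataset changes later in this commit drive it: open a channel with a release callback, push DataQueueItem batches, then close and confirm on shutdown. A condensed sketch against the declarations above (the channel name is a placeholder, and this assumes the MindSpore build tree for the include):

#include <vector>
#include "include/backend/data_queue/data_queue_mgr.h"

void SendBatch(const std::vector<mindspore::device::DataQueueItem> &items) {
  auto &mgr = mindspore::device::DataQueueMgr::GetInstance();
  // The release callback lets the producer reclaim host buffers once the
  // device copy is done (the GPU path below uses it to return pinned memory).
  auto release = [](void *, int32_t) {};
  if (mgr.Open("example_channel", release) != mindspore::device::DataQueueStatus::SUCCESS) {
    return;  // channel was never created for this device
  }
  (void)mgr.Push("example_channel", items, /*timeout_in_sec=*/60);
  mgr.Close("example_channel");
  mgr.CloseConfirm();
}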
@ -24,10 +24,6 @@ if(ENABLE_ACL)
add_definitions(-D ENABLE_ACL)
message(STATUS "ACL module is enabled")
endif()
if(ENABLE_GPUQUE)
add_definitions(-D ENABLE_GPUQUE)
message(STATUS "GPU queue is enabled")
endif()
if(ENABLE_TDTQUE)
add_definitions(-D ENABLE_TDTQUE)
message(STATUS "TDT queue is enabled")

@ -119,9 +115,6 @@ endif()
if(NOT ENABLE_SECURITY)
add_dependencies(engine-perf core)
endif()
if(ENABLE_TDTQUE)
add_dependencies(engine-device-queue core)
endif()

# for proto/cache_grpc.pb.h dependency
if(ENABLE_CACHE)

@ -204,11 +197,6 @@ if(ENABLE_TDTQUE)
if(NOT ENABLE_SECURITY)
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-perf>)
endif()
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-device-queue>)
# tdt impl
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-tdt>)
# host queue impl
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-host-queue>)
else()
if(NOT ENABLE_SECURITY)
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-perf>)

@ -251,16 +239,7 @@ target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::turb
mindspore::opencv_imgcodecs mindspore::opencv_imgproc mindspore::tinyxml2
mindspore::sentencepiece_train ${ICU_LIB})

if(ENABLE_GPUQUE)
target_link_libraries(_c_dataengine PRIVATE
${CUDNN_LIBRARY_PATH}
${CUDA_PATH}/lib64/libcudart.so
${CUDA_PATH}/lib64/stubs/libcuda.so)
endif()

if(ENABLE_TDTQUE)
target_link_libraries(_c_dataengine PRIVATE ${ACL} ${ACL_TDT_CHANNEL})
endif()
target_link_libraries(_c_dataengine PRIVATE mindspore_backend)

add_dependencies(_c_dataengine _c_mindrecord)
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
@ -10,10 +10,6 @@ endif()

add_subdirectory(cache)

if(ENABLE_TDTQUE)
add_subdirectory(device_queue_impl)
endif()

file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
set(SRC_FILES_LIST

@ -67,7 +63,3 @@ if(NOT ENABLE_SECURITY)
add_dependencies(engine-perf engine-cache-client)
endif()
endif()

if(ENABLE_TDTQUE)
add_dependencies(engine engine-device-queue)
endif()
@ -35,7 +35,9 @@
#include "minddata/mindrecord/include/shard_header.h"
#include "minddata/mindrecord/include/shard_writer.h"
#endif

#ifdef WITH_BACKEND
#include "utils/ms_context.h"
#endif
namespace mindspore {
namespace dataset {
#ifndef ENABLE_SECURITY

@ -355,18 +357,19 @@ Status TreeConsumer::Reset(int64_t step) {
#endif
RETURN_IF_NOT_OK(this->Terminate());
}

#ifdef ENABLE_GPUQUE
// clear the device if GPU is used.
std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
CHECK_FAIL_RETURN_UNEXPECTED(root != nullptr, "Root is a nullptr.");
DeviceQueueOp *op = dynamic_cast<DeviceQueueOp *>(root.get());
if (op != nullptr) {
MS_LOG(INFO) << "Clearing the GPU device";
RETURN_IF_NOT_OK(op->ClearDevice());
#ifdef WITH_BACKEND
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
if (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
// clear the device if GPU is used.
std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
CHECK_FAIL_RETURN_UNEXPECTED(root != nullptr, "Root is a nullptr.");
DeviceQueueOp *op = dynamic_cast<DeviceQueueOp *>(root.get());
if (op != nullptr) {
MS_LOG(INFO) << "Clearing the GPU device";
RETURN_IF_NOT_OK(op->ClearDevice());
}
}
#endif

tree_adapter_ = std::make_unique<TreeAdapter>(TreeAdapter::UsageFlag::kDeReset);
RETURN_IF_NOT_OK(tree_adapter_->Compile(old_root, num_epochs_, step));
RETURN_IF_NOT_OK(tree_adapter_->Launch());
@ -21,12 +21,33 @@
#include <memory>
#include <unordered_map>

#include "minddata/dataset/engine/gpu_item_connector.h"
#include "minddata/dataset/engine/dataset_iterator.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/util/task_manager.h"

#ifdef WITH_BACKEND
#include "mindspore/ccsrc/include/backend/data_queue/data_queue_mgr.h"
#endif
#include "mindspore/ccsrc/ps/ps_cache/ps_data/ps_data_prefetch.h"
#ifdef WITH_BACKEND
#include "utils/ms_context.h"
#endif
namespace mindspore {
namespace dataset {
namespace {
std::vector<DataQueueItem> ConvertTensorRowToDataQueueItem(const TensorRow &row) {
std::vector<device::DataQueueItem> items;
for (auto &i : row) {
device::DataQueueItem data_item;
data_item.data_len_ = static_cast<size_t>(i->SizeInBytes());
data_item.shapes_ = i->shape().AsVector();
data_item.data_ptr_ = const_cast<void *>(static_cast<const void *>(i->GetBuffer()));
data_item.data_type_ = i->type().ToString();
items.emplace_back(std::move(data_item));
}
return items;
}
} // namespace
DeviceQueueOp::DeviceQueueOp(const std::string channel_name, DeviceType device_type, int32_t device_id,
bool send_epoch_end, int32_t total_batch, bool create_data_info_queue)
: PipelineOp(1),
|
|||
create_data_info_queue_(create_data_info_queue),
|
||||
data_info_queue_ptr_(nullptr),
|
||||
first_fetch_flag_(false),
|
||||
first_push_flag_(false)
|
||||
#ifdef ENABLE_TDTQUE
|
||||
,
|
||||
ascend_keep_waiting_(true),
|
||||
tdtInstancePtr(std::make_shared<TdtPlugin>(channel_name_, device_id_))
|
||||
#endif
|
||||
{
|
||||
first_push_flag_(false),
|
||||
ascend_keep_waiting_(true) {
|
||||
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
|
||||
dynamic_shape_ = cfg->dynamic_shape();
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
// Get the total device num of current machine
|
||||
int32_t device_count = 0;
|
||||
cudaGetDeviceCount(&device_count);
|
||||
rank_id_ = cfg->rank_id(); // Get the current rank_id
|
||||
if (device_count > 0) {
|
||||
rank_id_ = rank_id_ % device_count;
|
||||
}
|
||||
// Be careful when try to modified these num_workers_ and queue_capacity_,
|
||||
// and we suggest num_workers_ * queue_capacity_ not greater than 16, because
|
||||
// one worker one circular_pool with 1G pin memory, so num_workers_ * queue_capacity_
|
||||
// must limit to avoid memory overload
|
||||
num_workers_ = kDeviceQueGpuNumThreads;
|
||||
queue_capacity_ = kDeviceQueGpuQueueCapacity;
|
||||
#endif
|
||||
#ifdef ENABLE_TDTQUE
|
||||
ascend_keep_waiting_ = true;
|
||||
if (kIsHeterogeneous) {
|
||||
tdtInstancePtr = std::make_shared<HostQueueImpl>(channel_name_, device_id_);
|
||||
} else {
|
||||
tdtInstancePtr = std::make_shared<TdtPlugin>(channel_name_, device_id_);
|
||||
#ifdef WITH_BACKEND
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
if (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice && !dynamic_shape_) {
|
||||
ascend_data_queue_ =
|
||||
device::DataQueueMgr::GetInstance().CreateDataQueue(kAscendDevice, channel_name, dynamic_shape_, 0, nullptr, {});
|
||||
}
|
||||
#endif
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
|
@ -87,13 +94,11 @@ DeviceQueueOp::~DeviceQueueOp() {
|
|||
#endif
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
void DeviceQueueOp::ReleaseData(void *addr, int32_t worker_id) {
|
||||
if (addr != nullptr && worker_id >= 0 && worker_id < pool_.size()) {
|
||||
pool_[worker_id]->Deallocate(addr);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
Status DeviceQueueOp::EoeReceived(int32_t worker_id) {
|
||||
state_ = OpState::kDeOpIdle;
|
||||
|
@ -150,7 +155,7 @@ Status DeviceQueueOp::operator()() {
|
|||
}
|
||||
#endif
|
||||
if (device_type_ == DeviceType::Ascend) {
|
||||
#ifdef ENABLE_TDTQUE
|
||||
#ifdef WITH_BACKEND
|
||||
if (create_data_info_queue_) {
|
||||
// This place has a race condition with GetDataInfo, so the first one
|
||||
// arrive here will do the initialize work.
|
||||
|
@ -162,10 +167,6 @@ Status DeviceQueueOp::operator()() {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (!kIsHeterogeneous && tdtInstancePtr->acl_handle_ == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED(
|
||||
"[Internal ERROR] Create channel for sending data failed, please check DEVICE ID setting.");
|
||||
}
|
||||
|
||||
if (dynamic_shape_) {
|
||||
RETURN_IF_NOT_OK(SendDataToAscendDynamic());
|
||||
|
@ -174,7 +175,7 @@ Status DeviceQueueOp::operator()() {
|
|||
}
|
||||
#endif
|
||||
} else if (device_type_ == DeviceType::GPU) {
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#ifdef WITH_BACKEND
|
||||
RETURN_IF_NOT_OK(SendDataToGPU());
|
||||
#endif
|
||||
} else if (device_type_ == DeviceType::CPU) {
|
||||
|
@ -184,7 +185,6 @@ Status DeviceQueueOp::operator()() {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
#ifdef ENABLE_TDTQUE
|
||||
Status DeviceQueueOp::SendDataToAscend() {
|
||||
MS_LOG(INFO) << "Device queue, sending data to Ascend.";
|
||||
#ifndef ENABLE_SECURITY
|
||||
|
@ -242,7 +242,7 @@ Status DeviceQueueOp::SendDataToAscend() {
#endif
PrintBeginInfoWhenFirstBatch(first_push_flag_);
// when training stopped, handle might have been destroyed immediately
if (!kIsHeterogeneous && tdtInstancePtr->acl_handle_ == nullptr) {
if (ascend_data_queue_ != nullptr && !ascend_data_queue_->IsOpen()) {
MS_LOG(WARNING) << "Thread has already been terminated.";
is_break_loop = true;
continue;
@ -306,9 +306,22 @@ Status DeviceQueueOp::SendEpochEndToAscend(const TensorRow &curr_row, const bool
int32_t *tdt_cost, bool *is_break_loop) {
RETURN_UNEXPECTED_IF_NULL(tdt_cost);
RETURN_UNEXPECTED_IF_NULL(is_break_loop);
if (curr_row.eoe() && send_epoch_end_ && tdtInstancePtr->acl_handle_ != nullptr) {
if (curr_row.eoe() && send_epoch_end_ && ascend_data_queue_->IsOpen()) {
TensorRow dummy_row;
auto status = tdtInstancePtr->hostPush(dummy_row, is_profiling_enable, tdt_cost, ACL_TENSOR_DATA_END_OF_SEQUENCE);
#ifndef ENABLE_SECURITY
double start_time = 0;
if (is_profiling_enable) {
start_time = ProfilingTime::GetCurMilliSecond();
}
#endif
auto status = ascend_data_queue_->Push({});
#ifndef ENABLE_SECURITY
if (is_profiling_enable) {
double end_time = ProfilingTime::GetCurMilliSecond();
// compile error occurring when MS_EXCEPTION_IF_NULL
*tdt_cost = static_cast<int32_t>(end_time - start_time);
}
#endif

RETURN_IF_NOT_OK(CheckPushStatus(status, stop_send_, &send_finished_, is_break_loop));
MS_LOG(INFO) << "an epoch has already sent, now stop send data.";
@ -337,8 +350,22 @@ void DeviceQueueOp::LimitSendingBatches(int64_t send_batch, int64_t *sending_num
}

Status DeviceQueueOp::SendRowToTdt(TensorRow curr_row, bool is_profiling_enable, int32_t *tdt_cost) {
auto status = tdtInstancePtr->hostPush(curr_row, is_profiling_enable, tdt_cost);
if (status != Status::OK()) {
std::vector<device::DataQueueItem> items = ConvertTensorRowToDataQueueItem(curr_row);
#ifndef ENABLE_SECURITY
double start_time = 0;
if (is_profiling_enable) {
start_time = ProfilingTime::GetCurMilliSecond();
}
#endif
auto status = ascend_data_queue_->Push(items);
#ifndef ENABLE_SECURITY
if (is_profiling_enable) {
double end_time = ProfilingTime::GetCurMilliSecond();
// compile error occurring when MS_EXCEPTION_IF_NULL
*tdt_cost = static_cast<int32_t>(end_time - start_time);
}
#endif
if (status != device::DataQueueStatus::SUCCESS) {
if (stop_send_) {
MS_LOG(INFO) << "stop_send received";
return Status::OK();
@ -358,15 +385,16 @@ Status DeviceQueueOp::SendRowToTdt(TensorRow curr_row, bool is_profiling_enable,
return Status::OK();
}

Status DeviceQueueOp::CheckPushStatus(Status status, bool stop_send, bool *send_finished, bool *is_break_loop) {
if (status != Status::OK()) {
Status DeviceQueueOp::CheckPushStatus(DataQueueStatus status, bool stop_send, bool *send_finished,
bool *is_break_loop) {
if (status != DataQueueStatus::SUCCESS) {
if (stop_send) {
*send_finished = true;
MS_LOG(INFO) << "stop_send received";
return Status::OK();
}
// when training stopped, handle might have been destroyed immediately
if (tdtInstancePtr->acl_handle_ == nullptr) {
if (!ascend_data_queue_->IsOpen()) {
*is_break_loop = true;
MS_LOG(WARNING) << "Thread has already been terminated.";
return Status::OK();
@ -379,10 +407,14 @@ Status DeviceQueueOp::CheckPushStatus(Status status, bool stop_send, bool *send_
}
return Status::OK();
}
#endif

#ifdef ENABLE_TDTQUE
Status DeviceQueueOp::GetDataInfo(DATA_INFO *data_info) {
#ifdef WITH_BACKEND
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
if (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
RETURN_STATUS_UNEXPECTED("'GetDataInfo' only supported on Ascend.");
}

if (!create_data_info_queue_) {
RETURN_STATUS_UNEXPECTED("[Internal ERROR] DataInfo queue is not created.");
}
@ -396,38 +428,29 @@ Status DeviceQueueOp::GetDataInfo(DATA_INFO *data_info) {
}
}
RETURN_IF_NOT_OK(data_info_queue_ptr_->PopFront(data_info));
#endif
return Status::OK();
}
#else
Status DeviceQueueOp::GetDataInfo(DATA_INFO *data_info) {
RETURN_STATUS_UNEXPECTED("'GetDataInfo' only supported on Ascend.");
}
#endif

#ifdef ENABLE_GPUQUE
Status DeviceQueueOp::SetThreadDevice() {
// Without cudaSetDevice cuda memory will allocate on GPU:0 as default
// and will overload in distribute scenario.
auto ret = cudaSetDevice(rank_id_);
if (ret != cudaSuccess) {
std::string err;
err += "cudaSetDevice failed, ret[";
err += std::to_string(static_cast<int>(ret));
err += "], ";
err += cudaGetErrorString(ret);
RETURN_STATUS_UNEXPECTED(err);
}
#ifdef WITH_BACKEND
(void)device::DataQueueMgr::GetInstance().SetThreadDevice("GPU");
#endif
return Status::OK();
}

Status DeviceQueueOp::LaunchParallelCopyThread() {
#ifdef WITH_BACKEND
RETURN_UNEXPECTED_IF_NULL(tree_);
// Every thread use cuda api should SetThreadDevice
RETURN_IF_NOT_OK(SetThreadDevice());
// CircularPool may not safe under multi-threads scenario, so one worker with one pool
for (int i = 0; i < num_workers_; i++) {
std::shared_ptr<MemoryPool> pool;
RETURN_IF_NOT_OK(CircularPool::CreateCircularPool(&pool, -1, kDeviceQueGpuThreadMemory, false, true));
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
RETURN_IF_NOT_OK(CircularPool::CreateCircularPool(
&pool, -1, kDeviceQueGpuThreadMemory, false,
MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice));
pool_.push_back(pool);
}
gpu_connector_ = std::make_unique<GpuConnector>(num_workers_, 1, queue_capacity_);
@ -437,15 +460,20 @@ Status DeviceQueueOp::LaunchParallelCopyThread() {
tree_->LaunchWorkers(num_workers_, std::bind(&DeviceQueueOp::WorkerEntry, this, std::placeholders::_1), "", id()));
RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("Push data to GPU queue",
std::bind(&DeviceQueueOp::PushDataToGPU, this), nullptr, id()));

#endif
return Status::OK();
}

bool DeviceQueueOp::NoExceptionRaised() {
return !TaskManager::FindMe()->Interrupted() && !mindspore::DataQueueHandler::IsClosed();
#ifdef WITH_BACKEND
return !TaskManager::FindMe()->Interrupted() && !device::DataQueueMgr::GetInstance().IsClosed();
#else
return !TaskManager::FindMe()->Interrupted();
#endif
}

Status DeviceQueueOp::PushDataToGPU() {
#ifdef WITH_BACKEND
RETURN_UNEXPECTED_IF_NULL(tree_);
// Every thread use cuda api should SetThreadDevice
RETURN_IF_NOT_OK(SetThreadDevice());
@ -473,17 +501,17 @@ Status DeviceQueueOp::PushDataToGPU() {
bool eoe_flag = item.eoe_flag;
int64_t send_batch = 0;
auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1, std::placeholders::_2);
BlockQueueStatus_T ret;
DataQueueStatus ret;
if (dynamic_shape_) {
ret = mindspore::DataQueueHandler::OpenDynamicBufQueue(channel_name_, release_function);
ret = device::DataQueueMgr::GetInstance().OpenDynamicBufQueue(channel_name_, release_function);
} else {
ret = mindspore::DataQueueHandler::Open(channel_name_, release_function);
ret = device::DataQueueMgr::GetInstance().Open(channel_name_, release_function);
}
if (ret != BlockQueueStatus_T::SUCCESS) {
if (ret != DataQueueStatus::SUCCESS) {
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to open channel for sending data.");
}

while (!(items.empty() && !eoe_flag) && !mindspore::DataQueueHandler::IsClosed()) {
while (!(items.empty() && !eoe_flag) && !device::DataQueueMgr::GetInstance().IsClosed()) {
if (!eoe_flag) {
#ifdef ENABLE_DUMP_IR
md_channel_info_->RecordBatchQueue(gpu_connector_->size());
@ -524,8 +552,8 @@ Status DeviceQueueOp::PushDataToGPU() {
eoe_flag = item.eoe_flag;
// If the batches send by dataset are more than gpu calculate, gpu will core for no signal notify.
if (rc.IsError()) {
mindspore::DataQueueHandler::Close(channel_name_);
mindspore::DataQueueHandler::CloseConfirm();
device::DataQueueMgr::GetInstance().Close(channel_name_);
device::DataQueueMgr::GetInstance().CloseConfirm();
return rc;
}
} else {

@ -540,8 +568,8 @@ Status DeviceQueueOp::PushDataToGPU() {
tree_->SetFinished();
MS_LOG(INFO) << "ExecutionTree finished. Device queue pushed number of batches: " << send_batch;

mindspore::DataQueueHandler::Close(channel_name_);
mindspore::DataQueueHandler::CloseConfirm();
device::DataQueueMgr::GetInstance().Close(channel_name_);
device::DataQueueMgr::GetInstance().CloseConfirm();
return Status::OK();
}
@ -553,7 +581,7 @@ Status DeviceQueueOp::WorkerEntry(int32_t worker_id) {
TensorRow current_row;
uint32_t batch_num = 0;
RETURN_IF_NOT_OK(receive_queues_[worker_id]->PopFront(&current_row));
while (!current_row.quit() && !mindspore::DataQueueHandler::IsClosed()) {
while (!current_row.quit() && !device::DataQueueMgr::GetInstance().IsClosed()) {
GpuConnectorItem connector_item = {{}, current_row.eoe()};
if (!connector_item.eoe_flag) {
std::vector<device::DataQueueItem> items;

@ -565,6 +593,7 @@ Status DeviceQueueOp::WorkerEntry(int32_t worker_id) {
data_item.worker_id_ = worker_id;
items.push_back(data_item);
}

RETURN_IF_NOT_OK(MallocForGPUData(&items, current_row, worker_id));
connector_item.data_item = std::move(items);
batch_num++;

@ -579,10 +608,12 @@ Status DeviceQueueOp::WorkerEntry(int32_t worker_id) {
// Add empty data_item vector with eoe_flag=false as quit flag.
GpuConnectorItem connector_item = {{}, false};
RETURN_IF_NOT_OK(gpu_connector_->Add(worker_id, std::move(connector_item)));
#endif
return Status::OK();
}

Status DeviceQueueOp::SendDataToGPU() {
#ifdef WITH_BACKEND
RETURN_IF_NOT_OK(LaunchParallelCopyThread());
MS_LOG(INFO) << "Device queue, sending data to GPU.";
#ifndef ENABLE_SECURITY
@ -594,10 +625,8 @@ Status DeviceQueueOp::SendDataToGPU() {
first_fetch_flag_ = true;
int64_t num_buf = 0;
bool is_break_loop = false;
uint32_t batch_num = 0;
while (!current_row.eof() && !is_break_loop && !mindspore::DataQueueHandler::IsClosed()) {
while (!current_row.eoe() && !is_break_loop && !mindspore::DataQueueHandler::IsClosed()) {
batch_num++;
while (!current_row.eof() && !is_break_loop && !device::DataQueueMgr::GetInstance().IsClosed()) {
while (!current_row.eoe() && !is_break_loop && !device::DataQueueMgr::GetInstance().IsClosed()) {
RETURN_IF_NOT_OK(FilterMetadata(&current_row));
RETURN_IF_NOT_OK(CheckExceptions(current_row));
#ifndef ENABLE_SECURITY

@ -636,6 +665,7 @@ Status DeviceQueueOp::SendDataToGPU() {
}

MS_LOG(INFO) << "Device queue received number of batches and EOEs: " << (num_buf - num_workers_);
#endif
return Status::OK();
}
@ -664,23 +694,23 @@ Status DeviceQueueOp::MallocForGPUData(std::vector<device::DataQueueItem> *items
}

Status DeviceQueueOp::ClearDevice() {
#ifdef WITH_BACKEND
MS_LOG(INFO) << "Clearing the data in GPU device: " << device_id_ << " channel: " << channel_name_;
auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1, std::placeholders::_2);
auto ret = mindspore::DataQueueHandler::Open(channel_name_, release_function);
if (ret != BlockQueueStatus_T::SUCCESS) {
auto ret = device::DataQueueMgr::GetInstance().Open(channel_name_, release_function);
if (ret != DataQueueStatus::SUCCESS) {
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to open channel for clearing the device.");
}

ret = mindspore::DataQueueHandler::Clear(channel_name_);
CHECK_FAIL_RETURN_UNEXPECTED(!ret, "Failed to clear the device.");
ret = device::DataQueueMgr::GetInstance().Clear(channel_name_);
CHECK_FAIL_RETURN_UNEXPECTED(ret == DataQueueStatus::SUCCESS, "Failed to clear the device.");

mindspore::DataQueueHandler::Close(channel_name_);
mindspore::DataQueueHandler::CloseConfirm();
device::DataQueueMgr::GetInstance().Close(channel_name_);
device::DataQueueMgr::GetInstance().CloseConfirm();
#endif
return Status::OK();
}

#endif

Status DeviceQueueOp::SendDataToCPU() {
MS_LOG(INFO) << "Device queue, sending data to CPU.";
int64_t total_batch = 0;
@ -794,6 +824,7 @@ void DeviceQueueOp::PrintEndInfoWhenFirstBatch(bool *first_push_flag) {
}
Status DeviceQueueOp::RetryPushData(const std::vector<DataQueueItem> &items, const bool profiling,
uint64_t *push_time) {
#ifdef WITH_BACKEND
bool flag_log = false;
#ifndef ENABLE_SECURITY
uint64_t start_time = 0;

@ -801,10 +832,10 @@ Status DeviceQueueOp::RetryPushData(const std::vector<DataQueueItem> &items, con
start_time = ProfilingTime::GetCurMilliSecond();
}
#endif
while (!mindspore::DataQueueHandler::IsClosed() && !TaskManager::FindMe()->Interrupted()) {
BlockQueueStatus_T ret = mindspore::DataQueueHandler::Push(channel_name_, items, WAIT_TIME);
if (ret != BlockQueueStatus_T::SUCCESS) {
if (ret == BlockQueueStatus_T::ERROR_INPUT) {
while (!device::DataQueueMgr::GetInstance().IsClosed() && !TaskManager::FindMe()->Interrupted()) {
DataQueueStatus ret = device::DataQueueMgr::GetInstance().Push(channel_name_, items, WAIT_TIME);
if (ret != DataQueueStatus::SUCCESS) {
if (ret == DataQueueStatus::ERROR_INPUT) {
return Status(
StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
"Invalid data, the types or shapes of current row is different with previous row(i.e. do batch operation but "
@ -827,11 +858,13 @@ Status DeviceQueueOp::RetryPushData(const std::vector<DataQueueItem> &items, con
if (profiling) {
*push_time = ProfilingTime::GetCurMilliSecond() - start_time;
}
#endif
#endif
return Status::OK();
}

Status DeviceQueueOp::SendDataToAscendDynamic() {
#ifdef WITH_BACKEND
MS_LOG(DEBUG) << "Dynamic Device queue, sending data to Ascend.";

int64_t send_batch = 0;

@ -840,8 +873,8 @@ Status DeviceQueueOp::SendDataToAscendDynamic() {
bool is_break_loop = false;

std::function<void(void *, int32_t)> release_function([](void *, int32_t) { return; });
auto ret = mindspore::DataQueueHandler::OpenDynamicBufQueue(channel_name_, release_function);
if (ret != BlockQueueStatus_T::SUCCESS) {
auto ret = device::DataQueueMgr::GetInstance().OpenDynamicBufQueue(channel_name_, release_function);
if (ret != DataQueueStatus::SUCCESS) {
return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
"[Internal ERROR] Failed to open channel for sending data.");
}
@ -854,16 +887,7 @@ Status DeviceQueueOp::SendDataToAscendDynamic() {
while (!curr_row.eoe() && !is_break_loop) {
RETURN_IF_NOT_OK(FilterMetadata(&curr_row));
RETURN_IF_NOT_OK(CheckExceptions(curr_row));
std::vector<device::DataQueueItem> items;
for (auto &i : curr_row) {
device::DataQueueItem data_item;
data_item.data_len_ = static_cast<size_t>(i->SizeInBytes());
data_item.shapes_ = i->shape().AsVector();
data_item.data_ptr_ = const_cast<void *>(static_cast<const void *>(i->GetBuffer()));
data_item.data_type_ = i->type().ToString();
items.push_back(data_item);
}

std::vector<device::DataQueueItem> items = ConvertTensorRowToDataQueueItem(curr_row);
RETURN_IF_NOT_OK(RetryPushData(items, false, &data_queue_cost));
if (create_data_info_queue_) {
DATA_INFO data_info;

@ -895,9 +919,9 @@ Status DeviceQueueOp::SendDataToAscendDynamic() {
tree_->SetFinished();
MS_LOG(INFO) << "ExecutionTree finished. Device queue sent number of batches: " << send_batch;

mindspore::DataQueueHandler::Close(channel_name_);
mindspore::DataQueueHandler::CloseConfirm();

device::DataQueueMgr::GetInstance().Close(channel_name_);
device::DataQueueMgr::GetInstance().CloseConfirm();
#endif
return Status::OK();
}
} // namespace dataset
@ -27,30 +27,20 @@

#include "minddata/dataset/engine/perf/device_queue_tracing.h"
#include "minddata/dataset/util/status.h"
#include "mindspore/core/utils/data_queue_handler.h"
#ifdef ENABLE_DUMP_IR
#include "minddata/dataset/util/rdr.h"
#endif

#ifdef ENABLE_TDTQUE
#include "minddata/dataset/util/queue.h"
#include "minddata/dataset/engine/device_queue_impl/device_queue_base.h"
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_plugin.h"
#include "minddata/dataset/engine/device_queue_impl/host_queue/host_queue_plugin.h"
#endif

#ifdef ENABLE_GPUQUE
#include "minddata/dataset/engine/gpu_item_connector.h"
#include "minddata/dataset/util/circular_pool.h"
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
#endif
#include "mindspore/ccsrc/include/backend/data_queue/data_queue.h"

namespace mindspore {
namespace dataset {
class GpuConnector;
using DATA_INFO = std::vector<std::pair<DataType, TensorShape>>;
using DATA_INFO_QUEUE = Queue<DATA_INFO>;
using mindspore::device::BlockQueueStatus_T;
using mindspore::device::DataQueueItem;
using mindspore::device::DataQueueStatus;
constexpr int32_t kTimeOutMilliSeconds = 25000;
const int kDataInfoQueueCapacity = 128;

@ -83,13 +73,9 @@ class DeviceQueueOp : public PipelineOp {
stop_send_ = false;
}

#ifdef ENABLE_TDTQUE
void StopWaiting() { ascend_keep_waiting_ = false; }
#endif

#ifdef ENABLE_GPUQUE
Status ClearDevice();
#endif

Status GetDataInfo(DATA_INFO *data_info);

@ -136,7 +122,6 @@ class DeviceQueueOp : public PipelineOp {
bool NoExceptionRaised();
Status SendDataToAscendDynamic();

#ifdef ENABLE_TDTQUE
void WaitContinueSignal() const;
Status SendDataToAscend();
Status SendEpochEndToAscend(const TensorRow &curr_row, const bool &is_profiling_enable, int32_t *tdt_cost,

@ -144,11 +129,9 @@ class DeviceQueueOp : public PipelineOp {
void LimitSendingBatches(int64_t send_batch, int64_t *sending_num, std::shared_ptr<ConfigManager> cfg);
Status SendRowToTdt(TensorRow curr_row, bool is_profiling_enable, int32_t *tdt_cost);
// check status that push data into device
Status CheckPushStatus(Status status, bool stop_send, bool *send_finished, bool *is_break_loop);
Status CheckPushStatus(DataQueueStatus status, bool stop_send, bool *send_finished, bool *is_break_loop);
bool ascend_keep_waiting_;
#endif

#ifdef ENABLE_GPUQUE
Status SendDataToGPU();
Status MallocForGPUData(std::vector<device::DataQueueItem> *items, const TensorRow &curr_row,
const int32_t &worker_id);

@ -166,11 +149,6 @@ class DeviceQueueOp : public PipelineOp {
const uint32_t kDeviceQueGpuThreadMemory = 1024;
uint32_t num_workers_;
uint32_t queue_capacity_;
// This rank_id is for device_queue, one process work with only one rank_id,
// for standalone scenario, this rank_id may come from env 'CUDA_VISIBLE_DEVICES',
// but for distribute scenario, this rank_id come from _get_global_rank() in python
uint32_t rank_id_;
#endif

Status SendDataToCPU();
#ifndef ENABLE_SECURITY

@ -195,10 +173,8 @@ class DeviceQueueOp : public PipelineOp {
std::mutex data_info_mutex_;
bool first_push_flag_; // default: false, when first push, it will be true
bool dynamic_shape_{false};
std::shared_ptr<device::DataQueue> ascend_data_queue_;

#ifdef ENABLE_TDTQUE
std::shared_ptr<DeviceQueueBase> tdtInstancePtr;
#endif
#ifdef ENABLE_DUMP_IR
std::shared_ptr<MDChannelInfo> md_channel_info_;
#endif
@ -1,11 +0,0 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
if(ENABLE_TDTQUE)
add_subdirectory(tdt)
add_subdirectory(host_queue)
endif()
add_library(engine-device-queue OBJECT device_queue_base.cc)
if(ENABLE_TDTQUE)
add_dependencies(engine-device-queue engine-tdt)
add_dependencies(engine-device-queue engine-host-queue)
endif()
@ -1,95 +0,0 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/device_queue_impl/device_queue_base.h"
#include "utils/ms_utils.h"
#ifndef ENABLE_SECURITY
#include "minddata/dataset/engine/perf/profiling.h"
#endif
#include "minddata/dataset/util/log_adapter.h"
#if ENABLE_D
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
#endif

namespace mindspore {
namespace dataset {
constexpr auto kUnknownErrorString = "Unknown error occurred";
extern const bool kIsHeterogeneous = []() noexcept -> bool {
int32_t is_heterogeneous = 0;
(void)rtGetIsHeterogenous(&is_heterogeneous);
return is_heterogeneous == 1;
}();

DeviceQueueBase::DeviceQueueBase(const std::string &channel_name, int32_t device_id) {
// init ErrorManager, 0 means success
if (ErrorManager::GetInstance().Init() != 0) {
MS_LOG(WARNING) << "[Internal Error] Init ErrorManager failed.";
}
channel_name_ = channel_name;
device_id_ = device_id;
}

Status DeviceQueueBase::GetAclDataType(DataType d_type, aclDataType *datatype) {
switch (d_type.value()) {
case DataType::DE_BOOL:
*datatype = ACL_BOOL;
break;
case DataType::DE_INT8:
*datatype = ACL_INT8;
break;
case DataType::DE_UINT8:
*datatype = ACL_UINT8;
break;
case DataType::DE_INT16:
*datatype = ACL_INT16;
break;
case DataType::DE_UINT16:
*datatype = ACL_UINT16;
break;
case DataType::DE_INT32:
*datatype = ACL_INT32;
break;
case DataType::DE_UINT32:
*datatype = ACL_UINT32;
break;
case DataType::DE_FLOAT16:
*datatype = ACL_FLOAT16;
break;
case DataType::DE_FLOAT32:
*datatype = ACL_FLOAT;
break;
case DataType::DE_FLOAT64:
*datatype = ACL_DOUBLE;
break;
case DataType::DE_INT64:
*datatype = ACL_INT64;
break;
case DataType::DE_UINT64:
*datatype = ACL_UINT64;
break;
default:
RETURN_STATUS_UNEXPECTED("Invalid data, got unexpected data type.");
}
return Status::OK();
}

void DeviceQueueBase::ReportErrorMessage() {
const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage();
if (!error_message.empty() && error_message.find(kUnknownErrorString) == std::string::npos) {
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
}
}
} // namespace dataset
} // namespace mindspore
@ -1,65 +0,0 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_DEVICE_QUEUE_IMPL_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_DEVICE_QUEUE_IMPL_H_

#include <dlfcn.h>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <thread>
#include "acl/acl_tdt.h"
#include "runtime/rt_mem_queue.h"
#include "runtime/dev.h"
#include "runtime/config.h"
#include "graph/def_types.h"
#include "common/util/error_manager/error_manager.h"

#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/core/tensor_row.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {
extern const bool kIsHeterogeneous;

class DeviceQueueBase {
public:
virtual Status hostPush(TensorRow ts_row, bool profiling, int32_t *time,
acltdtTensorType tdt_type = ACL_TENSOR_DATA_TENSOR) = 0;

DeviceQueueBase(const std::string &channel_name, int32_t device_id);

virtual ~DeviceQueueBase() {}

acltdtChannelHandle *acl_handle_;

protected:
Status GetAclDataType(DataType d_type, aclDataType *datatype);

void ReportErrorMessage();

std::string channel_name_;

int32_t device_id_ = 0;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_DEVICE_QUEUE_IMPL_H_
@ -1,3 +0,0 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(engine-host-queue OBJECT host_queue_plugin.cc)
@ -1,300 +0,0 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/engine/device_queue_impl/host_queue/host_queue_plugin.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "minddata/dataset/engine/perf/profiling.h"
|
||||
#endif
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
constexpr auto kMbufHeadEndOfSequencePos = 128U;
|
||||
constexpr auto kEndOfSequenceFlag = 0x5A;
|
||||
constexpr auto kTransIdOffset = 64UL;
|
||||
constexpr auto kSleepMilliSeconds = 500;
|
||||
const std::map<acltdtTensorType, int32_t> type_map = {
|
||||
{ACL_TENSOR_DATA_TENSOR, 0}, {ACL_TENSOR_DATA_END_OF_SEQUENCE, 1}, {ACL_TENSOR_DATA_ABNORMAL, 2}};
|
||||
|
||||
HostQueueImpl::HostQueueImpl(const std::string &channel_name, int32_t device_id)
|
||||
: DeviceQueueBase(channel_name, device_id) {
|
||||
auto status = HostQueueInit();
|
||||
if (status != Status::OK()) {
|
||||
MS_LOG(WARNING) << "Host queue init failed.";
|
||||
}
|
||||
}
|
||||
|
||||
Status HostQueueImpl::hostPush(TensorRow ts_row, bool profiling, int32_t *time, acltdtTensorType tdt_type) {
|
||||
#ifndef ENABLE_SECURITY
|
||||
double start_time;
|
||||
if (profiling) {
|
||||
start_time = ProfilingTime::GetCurMilliSecond();
|
||||
}
|
||||
#endif
|
||||
// send data by datagw
|
||||
auto status = SendDataByHostQueue(ts_row, tdt_type);
|
||||
if (status != Status::OK()) {
|
||||
RETURN_STATUS_UNEXPECTED("Host queue send data failed.");
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (profiling) {
|
||||
double end_time = ProfilingTime::GetCurMilliSecond();
|
||||
*time = static_cast<int32_t>(end_time - start_time);
|
||||
}
|
||||
#endif
|
||||
return status;
|
||||
}
|
||||
|
||||
Status HostQueueImpl::HostQueueInit() {
|
||||
auto ret = rtSetDevice(device_id_);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtSetDevice failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:rtSetDevice failed.");
|
||||
}
|
||||
|
||||
ret = rtMemQueueInit(device_id_);
|
||||
if (ret != ACL_RT_SUCCESS && ret != ACL_ERROR_RT_REPEATED_INIT) {
|
||||
MS_LOG(ERROR) << "call rtMemQueueInit failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:rtMemQueueInit failed.");
|
||||
}
|
||||
|
||||
rtMemQueueAttr_t attr = {};
|
||||
auto mem_ret = memset_s(attr.name, RT_MQ_MAX_NAME_LEN, 0, RT_MQ_MAX_NAME_LEN);
|
||||
if (mem_ret != EOK) {
|
||||
MS_LOG(ERROR) << "call memset_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:memset_s failed.");
|
||||
}
|
||||
mem_ret = memcpy_s(attr.name, RT_MQ_MAX_NAME_LEN, channel_name_.c_str(), channel_name_.size() + 1);
|
||||
if (mem_ret != EOK) {
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:memcpy_s failed.");
|
||||
}
|
||||
|
||||
attr.depth = 128U;
|
||||
attr.workMode = RT_MQ_MODE_DEFAULT;
|
||||
attr.flowCtrlFlag = false;
|
||||
attr.flowCtrlDropTime = 0U;
|
||||
attr.overWriteFlag = false;
|
||||
ret = rtMemQueueCreate(device_id_, &attr, &queue_id_);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMemQueueCreate failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:rtMemQueueCreate failed.");
|
||||
}
|
||||
|
||||
rtMemBuffCfg_t buff_cfg = {0};
|
||||
ret = rtMbufInit(&buff_cfg);
|
||||
if (ret != ACL_RT_SUCCESS && ret != ACL_ERROR_RT_REPEATED_INIT) {
|
||||
MS_LOG(ERROR) << "call rtMbufInit failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:rtMbufInit failed.");
|
||||
}
|
||||
const std::lock_guard<std::mutex> lk(queue_id_to_trans_id_map_mutex);
|
||||
(void)queue_id_to_trans_id_map.insert({queue_id_, 0UL});
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::AddDataItemInfo(const acltdtTensorType &tdt_data_type, const int32_t &tensor_type,
|
||||
const int64_t *dims, const size_t &dim_size, void *data_ptr,
|
||||
const uint64_t &data_len, std::vector<DataItemInfo> *items) {
|
||||
DataItemInfo item = {};
|
||||
int32_t data_type = 0;
|
||||
auto it = type_map.find(tdt_data_type);
|
||||
if (it != type_map.end()) {
|
||||
data_type = it->second;
|
||||
}
|
||||
item.ctrl_info.data_type = data_type;
|
||||
item.ctrl_info.tensor_type = tensor_type;
|
||||
item.ctrl_info.dim_num = dim_size;
|
||||
item.ctrl_info.data_len = data_len;
|
||||
item.dims.clear();
|
||||
for (size_t i = 0; i < dim_size; ++i) {
|
||||
item.dims.push_back(dims[i]);
|
||||
}
|
||||
item.data_ptr = data_ptr;
|
||||
items->push_back(item);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::CreateDataItemInfos(const acltdtTensorType &acl_type, const TensorRow &ts_row,
|
||||
std::vector<DataItemInfo> *items) {
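// Non-tensor control types (e.g. end-of-sequence) are encoded as a single placeholder
// item with no dims and no payload; otherwise every tensor in the row becomes one
// DataItemInfo. String tensors are rejected on this path.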
|
||||
if (acl_type != ACL_TENSOR_DATA_TENSOR) {
|
||||
return AddDataItemInfo(acl_type, ACL_BOOL, nullptr, 0UL, nullptr, 0UL, items);
|
||||
}
|
||||
|
||||
for (auto &ts : ts_row) {
|
||||
aclDataType data_type;
|
||||
RETURN_IF_NOT_OK(GetAclDataType(ts->type(), &data_type));
|
||||
TensorShape tsShape = ts->shape();
|
||||
std::shared_ptr<void> dataPtr =
|
||||
std::shared_ptr<void>(reinterpret_cast<uchar *>(&(*ts->begin<uint8_t>())), [](const void *elem) {});
|
||||
size_t dataLen = ts->SizeInBytes();
|
||||
const dsize_t dims = tsShape.Rank();
|
||||
std::vector<int64_t> dataShape;
|
||||
for (auto i = 0; i < dims; i++) {
|
||||
dataShape.emplace_back(tsShape[i]);
|
||||
}
|
||||
if (ts->type() != DataType::DE_STRING) {
|
||||
RETURN_IF_NOT_OK(AddDataItemInfo(ACL_TENSOR_DATA_TENSOR, data_type, (tsShape.empty() ? nullptr : &dataShape[0]),
|
||||
dims, dataPtr.get(), dataLen, items));
|
||||
} else {
|
||||
RETURN_STATUS_UNEXPECTED("Create data item failed when send data with type:" + std::to_string(data_type));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::SerializeDataItemInfos(std::vector<DataItemInfo> *items, void **buff,
|
||||
const acltdtTensorType &acl_type) {
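// Pack all items into one mbuf. Layout per item: the fixed ItemInfo control header,
// followed by dim_num int64_t dimensions, followed by data_len bytes of tensor data;
// total_size below is the sum of these three parts over all items.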
|
||||
size_t cnt = items->size();
|
||||
size_t total_size = 0UL;
|
||||
for (size_t i = 0UL; i < cnt; ++i) {
|
||||
(*items)[i].ctrl_info.cur_cnt = i;
|
||||
(*items)[i].ctrl_info.cnt = cnt;
|
||||
total_size += sizeof(ItemInfo) + (*items)[i].ctrl_info.dim_num * sizeof(int64_t) + (*items)[i].ctrl_info.data_len;
|
||||
}
|
||||
|
||||
auto rt_error = rtMbufAlloc(buff, total_size);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufAlloc with size[" << total_size << "] failed, ret = " << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo: rtMbufAlloc failed.");
|
||||
}
|
||||
|
||||
void *data = nullptr;
|
||||
rt_error = rtMbufGetBuffAddr(*buff, &data);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call rtMbufGetBuffAddr with size[" << total_size << "] failed, ret = " << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:rtMbufGetBuffAddr failed.");
|
||||
}
|
||||
|
||||
void *head_buf = nullptr;
|
||||
uint64_t head_size = 0UL;
|
||||
rt_error = rtMbufGetPrivInfo(*buff, &head_buf, &head_size);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call rtMbufGetPrivInfo failed, ret =" << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:rtMbufGetPrivInfo failed.");
|
||||
}
|
||||
if ((head_buf != nullptr) && (head_size > kMbufHeadEndOfSequencePos)) {
|
||||
MS_LOG(DEBUG) << "host queue set end_of_sequence mbuf head.";
|
||||
}
|
||||
|
||||
size_t offset = 0UL;
|
||||
for (size_t i = 0UL; i < cnt; ++i) {
|
||||
auto mem_ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), sizeof(ItemInfo), &((*items)[i].ctrl_info),
|
||||
sizeof(ItemInfo));
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:memcpy_s failed.");
|
||||
}
|
||||
offset += sizeof(ItemInfo);
|
||||
|
||||
for (size_t j = 0UL; j < (*items)[i].ctrl_info.dim_num; ++j) {
|
||||
mem_ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), sizeof(int64_t), &((*items)[i].dims[j]),
|
||||
sizeof(int64_t));
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:memcpy_s failed.");
|
||||
}
|
||||
offset += sizeof(int64_t);
|
||||
}
|
||||
|
||||
if ((*items)[i].ctrl_info.data_len == 0UL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
mem_ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), (*items)[i].ctrl_info.data_len,
|
||||
(*items)[i].data_ptr, (*items)[i].ctrl_info.data_len);
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:memcpy_s failed.");
|
||||
}
|
||||
offset += (*items)[i].ctrl_info.data_len;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::LaunchTensor2MBuff(const acltdtTensorType &acl_type, const TensorRow &tensor_row, void **buff) {
|
||||
std::vector<DataItemInfo> items;
|
||||
RETURN_IF_NOT_OK(CreateDataItemInfos(acl_type, tensor_row, &items));
|
||||
RETURN_IF_NOT_OK(SerializeDataItemInfos(&items, buff, acl_type));
|
||||
RETURN_IF_NOT_OK(SetTransId4MBuf(buff));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::SetTransId4MBuf(void **buff) {
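// The transaction id is written into the mbuf private info area, kTransIdOffset bytes
// before its end. The per-queue counter in queue_id_to_trans_id_map is incremented under
// the mutex, so every buffer pushed to a queue carries a monotonically increasing id.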
|
||||
void *head_buff = nullptr;
|
||||
uint64_t head_size = 0UL;
|
||||
auto ret = rtMbufGetPrivInfo(*buff, &head_buff, &head_size);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufGetPrivInfo failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("HostQueueSetTransId:rtMbufGetPrivInfo failed.");
|
||||
}
|
||||
uint64_t *trans_id = reinterpret_cast<uint64_t *>(static_cast<uint8_t *>(head_buff) + head_size - kTransIdOffset);
|
||||
const std::lock_guard<std::mutex> lk(queue_id_to_trans_id_map_mutex);
|
||||
*trans_id = ++queue_id_to_trans_id_map[queue_id_];
|
||||
MS_LOG(DEBUG) << "host queue[" << queue_id_ << "] set trans id[" << *trans_id << "] success";
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::EnqueueData(void *buff, bool *need_resend) {
|
||||
*need_resend = false;
|
||||
auto rt_error = rtSetDevice(device_id_);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtSetDevice device failed, ret=" << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("rtSetDevice failed.");
|
||||
}
|
||||
rt_error = rtMemQueueEnQueue(device_id_, queue_id_, buff);
|
||||
if (rt_error == RT_ERROR_NONE) {
|
||||
return Status::OK();
|
||||
} else if (rt_error == ACL_ERROR_RT_QUEUE_FULL) {
|
||||
*need_resend = true;
|
||||
MS_LOG(DEBUG) << "queue[" << queue_id_ << "] is full, need call rtMemQueueEnQueue again";
|
||||
} else {
|
||||
HostQueueFreeBuff(buff);
|
||||
MS_LOG(ERROR) << "host enqueue queue[" << queue_id_ << "] failed, ret = " << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("host enqueue queue failed.");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::SendDataByHostQueue(const TensorRow &tensor_row, const acltdtTensorType &data_type) {
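// Serialize the tensor row into an mbuf once, then retry the enqueue while the device
// queue reports "full" (EnqueueData sets is_need_resend), sleeping kSleepMilliSeconds
// between attempts; any other enqueue error terminates the loop.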
|
||||
Status status;
|
||||
bool is_need_resend = false;
|
||||
void *buff = nullptr;
|
||||
|
||||
RETURN_IF_NOT_OK(LaunchTensor2MBuff(data_type, tensor_row, &buff));
|
||||
do {
|
||||
if (is_need_resend) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kSleepMilliSeconds));
|
||||
}
|
||||
status = EnqueueData(buff, &is_need_resend);
|
||||
} while (status == Status::OK() && is_need_resend);
|
||||
return status;  // propagate the final enqueue result instead of always reporting OK
|
||||
}
|
||||
|
||||
void HostQueueImpl::HostQueueFreeBuff(void *buff) {
|
||||
auto rt_error = rtMbufFree(buff);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufFree failed, ret=" << rt_error;
|
||||
}
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -1,91 +0,0 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_HOST_QUEUE_HOST_QUEUE_PLUGIN_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_HOST_QUEUE_HOST_QUEUE_PLUGIN_H_
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
#include "acl/acl_tdt.h"
|
||||
#include "runtime/rt_mem_queue.h"
|
||||
#include "runtime/dev.h"
|
||||
#include "runtime/config.h"
|
||||
#include "graph/def_types.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/device_queue_base.h"
|
||||
|
||||
#include "minddata/dataset/core/data_type.h"
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/core/tensor_row.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
struct ItemInfo {
|
||||
int32_t version;
|
||||
int32_t data_type;
|
||||
uint32_t cur_cnt;
|
||||
uint32_t cnt;
|
||||
int32_t tensor_type;
|
||||
uint32_t dim_num;
|
||||
char reserved[32];
|
||||
uint64_t data_len;
|
||||
};
|
||||
|
||||
struct DataItemInfo {
|
||||
ItemInfo ctrl_info;
|
||||
std::vector<int64_t> dims;
|
||||
void *data_ptr;
|
||||
};
|
||||
|
||||
class HostQueueImpl : public DeviceQueueBase {
|
||||
public:
|
||||
Status hostPush(TensorRow ts_row, bool profiling, int32_t *time, acltdtTensorType tdt_type = ACL_TENSOR_DATA_TENSOR);
|
||||
|
||||
HostQueueImpl(const std::string &channel_name, int32_t device_id);
|
||||
|
||||
~HostQueueImpl() {}
|
||||
|
||||
private:
|
||||
Status HostQueueInit();
|
||||
|
||||
Status SendDataByHostQueue(const TensorRow &tensor_row, const acltdtTensorType &data_type);
|
||||
|
||||
Status SetTransId4MBuf(void **buff);
|
||||
|
||||
Status LaunchTensor2MBuff(const acltdtTensorType &acl_type, const TensorRow &tensor_row, void **buff);
|
||||
Status EnqueueData(void *buff, bool *need_resend);
|
||||
|
||||
Status CreateDataItemInfos(const acltdtTensorType &acl_type, const TensorRow &ts_row,
|
||||
std::vector<DataItemInfo> *items);
|
||||
Status SerializeDataItemInfos(std::vector<DataItemInfo> *items, void **buff, const acltdtTensorType &acl_type);
|
||||
Status AddDataItemInfo(const acltdtTensorType &tdt_data_type, const int32_t &tensor_type, const int64_t *dims,
|
||||
const size_t &dim_size, void *data_ptr, const uint64_t &data_len,
|
||||
std::vector<DataItemInfo> *items);
|
||||
void HostQueueFreeBuff(void *buff);
|
||||
|
||||
std::mutex queue_id_to_trans_id_map_mutex;
|
||||
std::map<uint32_t, uint64_t> queue_id_to_trans_id_map;
|
||||
uint32_t queue_id_ = 0;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_HOST_QUEUE_HOST_QUEUE_PLUGIN_H_
|
|
@ -1,3 +0,0 @@
|
|||
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
|
||||
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
|
||||
add_library(engine-tdt OBJECT tdt_plugin.cc tdt_handle.cc)
|
|
@ -1,20 +0,0 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
|
||||
// Implementation code for TdtHandle moved from 'tdt_handle.cc' to 'tdt_handle.h';
|
||||
// This is a workaround for cyclic dependency problem between dataset and core
|
||||
// when ENABLE_TDTQUE is enabled.
|
|
@ -1,84 +0,0 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_HANDLE_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_HANDLE_H_
|
||||
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
#include "acl/acl_tdt.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
class TdtHandle {
|
||||
public:
|
||||
static inline void AddHandle(acltdtChannelHandle **handle, std::thread *use_thread);
|
||||
|
||||
static inline bool DestroyHandle();
|
||||
|
||||
static inline void DelHandle(acltdtChannelHandle **handle);
|
||||
|
||||
private:
|
||||
TdtHandle() {}
|
||||
~TdtHandle() = default;
|
||||
};
|
||||
|
||||
inline void TdtHandle::AddHandle(acltdtChannelHandle **handle, std::thread *use_thread) {
|
||||
if (*handle != nullptr) {
|
||||
auto ret = acl_handle_map.emplace(reinterpret_cast<void **>(handle), use_thread);
|
||||
if (!std::get<1>(ret)) {
|
||||
MS_LOG(ERROR) << "Failed to add new handle to acl_handle_map." << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void TdtHandle::DelHandle(acltdtChannelHandle **handle) {
|
||||
void **void_handle = reinterpret_cast<void **>(handle);
|
||||
acl_handle_map.erase(void_handle);
|
||||
}
|
||||
|
||||
inline bool TdtHandle::DestroyHandle() {
|
||||
bool destroy_all = true;
|
||||
for (auto &item : acl_handle_map) {
|
||||
acltdtChannelHandle **handle = reinterpret_cast<acltdtChannelHandle **>(item.first);
|
||||
if (*handle != nullptr) {
|
||||
aclError stop_status = acltdtStopChannel(*handle);
|
||||
if (stop_status != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed stop acl data channel and the stop status is " << stop_status << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (item.second != nullptr && item.second->joinable()) {
|
||||
item.second->join();
|
||||
}
|
||||
if (acltdtDestroyChannel(*handle) != ACL_SUCCESS) {
|
||||
destroy_all = false;
|
||||
} else {
|
||||
*handle = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// clear the map container when all the handle has been destroyed
|
||||
if (destroy_all) {
|
||||
acl_handle_map.clear();
|
||||
}
|
||||
return destroy_all;
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_HANDLE_H_
|
|
@ -1,205 +0,0 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_plugin.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "minddata/dataset/engine/perf/profiling.h"
|
||||
#endif
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
#include "minddata/dataset/util/task_manager.h"
|
||||
#if ENABLE_D
|
||||
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
TdtPlugin::TdtPlugin(const std::string &channel_name, int32_t device_id) : DeviceQueueBase(channel_name, device_id) {
|
||||
// init ErrorManager, 0 means success
|
||||
if (ErrorManager::GetInstance().Init() != 0) {
|
||||
MS_LOG(WARNING) << "[Internal Error] Init ErrorManager failed.";
|
||||
}
|
||||
// create acl tdt handle
|
||||
acl_handle_ = acltdtCreateChannel(device_id, channel_name.c_str());
|
||||
if (acl_handle_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Create channel for sending data failed, please check DEVICE ID setting, DEVICE ID that passed "
|
||||
"into dataset(from context) and training process should be the same.";
|
||||
ReportErrorMessage();
|
||||
}
|
||||
TdtHandle::AddHandle(&acl_handle_, nullptr);
|
||||
}
|
||||
|
||||
TdtPlugin::~TdtPlugin() {
|
||||
if (acl_handle_ != nullptr) {
|
||||
if (acltdtDestroyChannel(acl_handle_) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed to destroy channel for tdt queue.";
|
||||
ReportErrorMessage();
|
||||
} else {
|
||||
TdtHandle::DelHandle(&acl_handle_);
|
||||
acl_handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Status TdtPlugin::hostPush(TensorRow ts_row, bool profiling, int32_t *time, acltdtTensorType tdt_type) {
|
||||
MS_LOG(DEBUG) << "TDT channel name is " << channel_name_ << ".";
|
||||
acltdtDataset *acl_dataset = nullptr;
|
||||
#ifndef ENABLE_SECURITY
|
||||
double start_time;
|
||||
#endif
|
||||
auto ret = translate(tdt_type, ts_row, &acl_dataset);
|
||||
if (ret != Status::OK()) {
|
||||
DestroyAclDataset(acl_dataset);
|
||||
RETURN_STATUS_UNEXPECTED("Converting into TDT tensor failed!");
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (profiling) {
|
||||
start_time = ProfilingTime::GetCurMilliSecond();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ENABLE_D
|
||||
// Data prefetch only when PS mode enables cache.
|
||||
if (acltdtGetDatasetSize(acl_dataset) > 0) {
|
||||
acltdtDataItem *item0 = acltdtGetDataItem(acl_dataset, 0);
|
||||
std::string item_type;
|
||||
RETURN_IF_NOT_OK(ParseType(acltdtGetDataTypeFromItem(item0), &item_type));
|
||||
if (!ps::PsDataPrefetch::GetInstance().PrefetchData(channel_name_, acltdtGetDataAddrFromItem(item0),
|
||||
acltdtGetDataSizeFromItem(item0), item_type)) {
|
||||
RETURN_STATUS_UNEXPECTED("PrefetchData failed in when pre-processing sending data.");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
auto status = acltdtSendTensor(acl_handle_, acl_dataset, -1);
|
||||
DestroyAclDataset(acl_dataset);
|
||||
if (status != ACL_SUCCESS) {
|
||||
// if the device_queue thread had been interrupted by master, just print warning and return success
|
||||
if (mindspore::dataset::this_thread::is_interrupted()) {
|
||||
MS_LOG(WARNING) << "Device queue thread had been interrupted by TdtHandle::DestroyHandle, you can ignore "
|
||||
<< "the above error: 'failed to send...'. In this scenario, the training ends first without "
|
||||
<< "using all epoch(s) data, and the data preprocessing is blocked by the data "
|
||||
<< "transmission channel on the device side. So we force the data transmission channel to stop.";
|
||||
return Status::OK();
|
||||
}
|
||||
ReportErrorMessage();
|
||||
RETURN_STATUS_UNEXPECTED("Tdt Send data failed.");
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (profiling) {
|
||||
double end_time = ProfilingTime::GetCurMilliSecond();
|
||||
*time = static_cast<int32_t>(end_time - start_time);
|
||||
}
|
||||
#endif
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status TdtPlugin::ParseType(const aclDataType &acl_data_type, std::string *data_type) {
|
||||
auto type_iter = parse_map.find(acl_data_type);
|
||||
if (type_iter == parse_map.end()) {
|
||||
RETURN_STATUS_UNEXPECTED("Got unsupported acl datatype: " + std::to_string(acl_data_type));
|
||||
}
|
||||
*data_type = type_iter->second;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status TdtPlugin::translate(acltdtTensorType tdt_type, const TensorRow &ts_row, acltdtDataset **output_acl_dataset) {
|
||||
auto acl_dataset = acltdtCreateDataset();
|
||||
if (acl_dataset == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED("Create tdt dataset failed.");
|
||||
}
|
||||
auto status = AssembleTensor2AclDataset(tdt_type, ts_row, acl_dataset);
|
||||
if (status != Status::OK()) {
|
||||
DestroyAclDataset(acl_dataset);
|
||||
RETURN_STATUS_UNEXPECTED("Assemble tensor row to tdt dataset failed.");
|
||||
}
|
||||
|
||||
*output_acl_dataset = acl_dataset;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status TdtPlugin::AssembleTensor2AclDataset(acltdtTensorType tdt_type, const TensorRow &ts_row,
|
||||
acltdtDataset *acl_dataset) {
|
||||
if (tdt_type != ACL_TENSOR_DATA_TENSOR || ts_row.size() == 0) {
|
||||
acltdtDataItem *acl_data = acltdtCreateDataItem(tdt_type, nullptr, 0, ACL_BOOL, nullptr, 0);
|
||||
if (acl_data == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED("Create data item failed when send data with type:" + std::to_string(tdt_type));
|
||||
}
|
||||
if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_SUCCESS) {
|
||||
if (acltdtDestroyDataItem(acl_data) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destroy data item failed when send data with type: " << tdt_type;
|
||||
}
|
||||
RETURN_STATUS_UNEXPECTED("Add data item to tdt dataset failed when send data.");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
for (auto ts : ts_row) {
|
||||
aclDataType datatype;
|
||||
acltdtDataItem *acl_data = nullptr;
|
||||
RETURN_IF_NOT_OK(GetAclDataType(ts->type(), &datatype));
|
||||
|
||||
TensorShape tsShape = ts->shape();
|
||||
std::string dataShapes = "[";
|
||||
for (auto dim : tsShape.AsVector()) {
|
||||
(void)dataShapes.append(std::to_string(dim)).append(",");
|
||||
}
|
||||
dataShapes.pop_back();
|
||||
(void)dataShapes.append("]");
|
||||
|
||||
std::shared_ptr<void> dataPtr =
|
||||
std::shared_ptr<void>(reinterpret_cast<uchar *>(&(*ts->begin<uint8_t>())), [](const void *elem) {});
|
||||
size_t dataLen = ts->SizeInBytes();
|
||||
const dsize_t dims = tsShape.Rank();
|
||||
std::vector<int64_t> dataShape;
|
||||
for (auto i = 0; i < dims; i++) {
|
||||
dataShape.emplace_back(tsShape[i]);
|
||||
}
|
||||
acl_data = acltdtCreateDataItem(ACL_TENSOR_DATA_TENSOR, (tsShape.empty() ? nullptr : &dataShape[0]), dims, datatype,
|
||||
dataPtr.get(), dataLen);
|
||||
if (acl_data == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED("Create data item failed when send data.");
|
||||
}
|
||||
if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_SUCCESS) {
|
||||
if (acltdtDestroyDataItem(acl_data) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destroy data item failed when send data with type ACL_TENSOR_DATA_TENSOR.";
|
||||
}
|
||||
RETURN_STATUS_UNEXPECTED("Add data item to tdt dataset failed when send data.");
|
||||
}
|
||||
|
||||
MS_LOG(DEBUG) << "TDT data type is TDT_TENSOR, tensor type is " << datatype << ", tensor shape is " << dataShapes
|
||||
<< ", data length is " << ts->Size() << ".";
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status TdtPlugin::DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item) {
|
||||
if (include_data_item) {
|
||||
for (size_t i = 0; i < acltdtGetDatasetSize(acl_dataset); i++) {
|
||||
if (acltdtDestroyDataItem(acltdtGetDataItem(acl_dataset, i)) != ACL_SUCCESS) {
|
||||
ReportErrorMessage();
|
||||
RETURN_STATUS_UNEXPECTED("Destroy data item failed when send data.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (acltdtDestroyDataset(acl_dataset) != ACL_SUCCESS) {
|
||||
ReportErrorMessage();
|
||||
RETURN_STATUS_UNEXPECTED("Destroy tdt dataset failed when send data.");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -1,71 +0,0 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_PLUGIN_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_PLUGIN_H_
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
#include "acl/acl_tdt.h"
|
||||
#include "runtime/rt_mem_queue.h"
|
||||
#include "runtime/dev.h"
|
||||
#include "runtime/config.h"
|
||||
#include "graph/def_types.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/device_queue_base.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
|
||||
#include "minddata/dataset/core/data_type.h"
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/core/tensor_row.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
class TdtPlugin : public DeviceQueueBase {
|
||||
public:
|
||||
static std::shared_ptr<TdtPlugin> GetInstance();
|
||||
|
||||
Status hostPush(TensorRow ts_row, bool profiling, int32_t *time, acltdtTensorType tdt_type = ACL_TENSOR_DATA_TENSOR);
|
||||
|
||||
TdtPlugin(const std::string &channel_name, int32_t device_id);
|
||||
|
||||
~TdtPlugin();
|
||||
|
||||
private:
|
||||
Status DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item = true);
|
||||
|
||||
Status AssembleTensor2AclDataset(acltdtTensorType tdt_type, const TensorRow &ts_row, acltdtDataset *acl_dataset);
|
||||
|
||||
Status ParseType(const aclDataType &acl_data_type, std::string *data_type);
|
||||
|
||||
Status translate(acltdtTensorType tdt_type, const TensorRow &ts_row, acltdtDataset **output_acl_dataset);
|
||||
|
||||
void *tdt_handle_ = nullptr;
|
||||
|
||||
const std::map<aclDataType, std::string> parse_map = {
|
||||
{ACL_INT8, "int8"}, {ACL_UINT8, "uint8"}, {ACL_INT16, "int16"}, {ACL_UINT16, "uint16"},
|
||||
{ACL_INT32, "int32"}, {ACL_UINT32, "uint32"}, {ACL_INT64, "int64"}, {ACL_UINT64, "uint64"},
|
||||
{ACL_FLOAT16, "float16"}, {ACL_FLOAT, "float32"}, {ACL_DOUBLE, "float64"}, {ACL_BOOL, "bool"}};
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_PLUGIN_H_
|
|
@ -19,7 +19,7 @@
|
|||
#include <limits>
|
||||
#include "minddata/dataset/engine/datasetops/dataset_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/device_queue_op.h"
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
#include "mindspore/core/utils/numa_interface.h"
|
||||
#endif
|
||||
#include "minddata/dataset/util/task_manager.h"
|
||||
|
@ -28,7 +28,7 @@
|
|||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Constructor
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
ExecutionTree::ExecutionTree() : ExecutionTree(GlobalContext::config_manager()) {}
|
||||
|
||||
ExecutionTree::ExecutionTree(std::shared_ptr<ConfigManager> cfg)
|
||||
|
@ -52,7 +52,7 @@ ExecutionTree::ExecutionTree() : id_count_(0), tree_state_(kDeTStateInit), prepa
|
|||
|
||||
// Destructor
|
||||
ExecutionTree::~ExecutionTree() {
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
if (numa_enable_) {
|
||||
handle_ = nullptr;
|
||||
}
|
||||
|
@ -152,7 +152,7 @@ void ExecutionTree::PrintNode(std::ostream &out, const std::shared_ptr<DatasetOp
|
|||
Status ExecutionTree::Launch() {
|
||||
// opencv limit too many threads
|
||||
#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) && !defined(ENABLE_ANDROID)
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
// Here we do numa binding for performance optimization. Our test results show that
|
||||
// binding numa when get_dataset_size launches a tree gives better performance than
|
||||
// binding it only at the time of _To_Device.
|
||||
|
|
|
@ -229,7 +229,7 @@ class ExecutionTree {
|
|||
TreeState tree_state_; // Tracking the current tree state
|
||||
std::string unique_id_; // A unique identifier for the tree
|
||||
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
// Constructor used when WITH_BACKEND is defined
|
||||
explicit ExecutionTree(std::shared_ptr<ConfigManager> cfg);
|
||||
// This rank_id is for numa and device_queue, one process work with only one rank_id,
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_GPU_ITEM_CONNECTOR_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_GPU_ITEM_CONNECTOR_H_
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
@ -24,7 +23,7 @@
|
|||
#include "minddata/dataset/engine/connector.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
#include "minddata/dataset/include/dataset/constants.h"
|
||||
#include "runtime/data_queue/blocking_queue.h"
|
||||
#include "include/backend/data_queue/blocking_queue.h"
|
||||
|
||||
using mindspore::device::DataQueueItem;
|
||||
|
||||
|
@ -88,5 +87,4 @@ class GpuConnector : public Connector<GpuConnectorItem> {
|
|||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // ENABLE_GPUQUE
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_GPU_ITEM_CONNECTOR_H_
|
||||
|
|
|
@ -20,17 +20,17 @@
|
|||
#include <algorithm>
|
||||
#include "utils/ms_utils.h"
|
||||
#include "minddata/dataset/util/path.h"
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#include "minddata/dataset/core/config_manager.h"
|
||||
#include "minddata/dataset/core/global_context.h"
|
||||
#endif
|
||||
#include "minddata/dataset/engine/perf/monitor.h"
|
||||
#include "minddata/dataset/engine/perf/connector_size.h"
|
||||
#include "minddata/dataset/engine/perf/cpu_sampler.h"
|
||||
#include "minddata/dataset/engine/execution_tree.h"
|
||||
#include "minddata/dataset/engine/tree_adapter.h"
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
|
||||
#ifdef WITH_BACKEND
|
||||
#include "utils/ms_context.h"
|
||||
#endif
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
|
@ -824,18 +824,19 @@ ProfilingManager::ProfilingRegistrationState ProfilingManager::GetProfilerTreeSt
|
|||
}
|
||||
|
||||
std::string ProfilingManager::GetRankID() const {
|
||||
std::string rank_id;
|
||||
#ifdef ENABLE_GPUQUE
|
||||
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
|
||||
int32_t rank_id_int = cfg->rank_id();
|
||||
// If DEVICE_ID is not set, default value is 0
|
||||
if (rank_id_int < 0) {
|
||||
rank_id = common::GetEnv("DEVICE_ID");
|
||||
} else {
|
||||
rank_id = std::to_string(rank_id_int);
|
||||
std::string rank_id = common::GetEnv("RANK_ID");
|
||||
#ifdef WITH_BACKEND
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
if (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
|
||||
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
|
||||
int32_t rank_id_int = cfg->rank_id();
|
||||
// If DEVICE_ID is not set, default value is 0
|
||||
if (rank_id_int < 0) {
|
||||
rank_id = common::GetEnv("DEVICE_ID");
|
||||
} else {
|
||||
rank_id = std::to_string(rank_id_int);
|
||||
}
|
||||
}
|
||||
#else
|
||||
rank_id = common::GetEnv("RANK_ID");
|
||||
#endif
|
||||
// If RANK_ID is not set, default value is 0
|
||||
if (rank_id.empty()) {
|
||||
|
|
|
@ -18,7 +18,9 @@
|
|||
#include <utility>
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
#include "minddata/dataset/util/system_pool.h"
|
||||
|
||||
#ifdef WITH_BACKEND
|
||||
#include "mindspore/ccsrc/include/backend/data_queue/data_queue_mgr.h"
|
||||
#endif
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
struct MemHdr {
|
||||
|
@ -243,29 +245,22 @@ std::ostream &operator<<(std::ostream &os, const ArenaImpl &s) {
|
|||
Status Arena::Init() {
|
||||
try {
|
||||
int64_t sz = size_in_MB_ * 1048576L;
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#ifdef WITH_BACKEND
|
||||
if (is_cuda_malloc_) {
|
||||
auto ret = cudaHostAlloc(&ptr_, sz, cudaHostAllocDefault);
|
||||
if (ret != cudaSuccess) {
|
||||
MS_LOG(ERROR) << "cudaHostAlloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
|
||||
return Status(StatusCode::kMDOutOfMemory);
|
||||
}
|
||||
impl_ = std::make_unique<ArenaImpl>(ptr_, sz);
|
||||
ptr_ = device::DataQueueMgr::GetInstance().AllocHostMem("GPU", sz);
|
||||
} else {
|
||||
RETURN_IF_NOT_OK(DeMalloc(sz, &ptr_, false));
|
||||
impl_ = std::make_unique<ArenaImpl>(ptr_, sz);
|
||||
ptr_ = device::DataQueueMgr::GetInstance().AllocHostMem("Others", sz);
|
||||
}
|
||||
#else
|
||||
RETURN_IF_NOT_OK(DeMalloc(sz, &ptr_, false));
|
||||
impl_ = std::make_unique<ArenaImpl>(ptr_, sz);
|
||||
ptr_ = std::shared_ptr<void>(::malloc(sz), ::free);
|
||||
#endif
|
||||
impl_ = std::make_unique<ArenaImpl>(ptr_.get(), sz);
|
||||
} catch (std::bad_alloc &e) {
|
||||
return Status(StatusCode::kMDOutOfMemory);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
Arena::Arena(size_t val_in_MB, bool is_cuda_malloc)
|
||||
: ptr_(nullptr), size_in_MB_(val_in_MB), is_cuda_malloc_(is_cuda_malloc) {}
|
||||
|
||||
|
@ -279,19 +274,5 @@ Status Arena::CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB, bool i
|
|||
RETURN_IF_NOT_OK(ba->Init());
|
||||
return Status::OK();
|
||||
}
|
||||
#else
|
||||
Arena::Arena(size_t val_in_MB) : ptr_(nullptr), size_in_MB_(val_in_MB) {}
|
||||
|
||||
Status Arena::CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB) {
|
||||
RETURN_UNEXPECTED_IF_NULL(p_ba);
|
||||
auto ba = new (std::nothrow) Arena(val_in_MB);
|
||||
if (ba == nullptr) {
|
||||
return Status(StatusCode::kMDOutOfMemory);
|
||||
}
|
||||
(*p_ba).reset(ba);
|
||||
RETURN_IF_NOT_OK(ba->Init());
|
||||
return Status::OK();
|
||||
}
|
||||
#endif
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -22,9 +22,6 @@
|
|||
#include "minddata/dataset/util/allocator.h"
|
||||
#include "minddata/dataset/util/memory_pool.h"
|
||||
#include "minddata/dataset/util/treap.h"
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#include <cuda_runtime_api.h>
|
||||
#endif
|
||||
|
||||
#define ARENA_LOG_BLK_SZ (6u)
|
||||
#define ARENA_BLK_SZ (static_cast<uint16_t>(1u << ARENA_LOG_BLK_SZ))
|
||||
|
@ -107,20 +104,7 @@ class Arena : public MemoryPool {
|
|||
// Disable copy and assignment constructor
|
||||
Arena(const Arena &) = delete;
|
||||
Arena &operator=(const Arena &) = delete;
|
||||
~Arena() override {
|
||||
#ifdef ENABLE_GPUQUE
|
||||
if (is_cuda_malloc_) {
|
||||
if (ptr_ != nullptr) {
|
||||
(void)cudaFreeHost(ptr_);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (ptr_ != nullptr) {
|
||||
free(ptr_);
|
||||
}
|
||||
ptr_ = nullptr;
|
||||
#endif
|
||||
}
|
||||
~Arena() override = default;
|
||||
|
||||
/// As a derived class of MemoryPool, we have to implement the following.
|
||||
/// But we simply transfer the call to the implementation class
|
||||
|
@ -150,28 +134,17 @@ class Arena : public MemoryPool {
|
|||
os << *(s.impl_);
|
||||
return os;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
/// The only method to create an arena.
|
||||
static Status CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB = 4096, bool is_cuda_malloc = false);
|
||||
#else
|
||||
/// The only method to create an arena.
|
||||
static Status CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB = 4096);
|
||||
#endif
|
||||
|
||||
protected:
|
||||
mutable std::mutex mux_;
|
||||
std::unique_ptr<ArenaImpl> impl_;
|
||||
void *ptr_;
|
||||
std::shared_ptr<void> ptr_;
|
||||
size_t size_in_MB_;
|
||||
#ifdef ENABLE_GPUQUE
|
||||
|
||||
bool is_cuda_malloc_;
|
||||
|
||||
explicit Arena(size_t val_in_MB = 4096, bool is_cuda_malloc = false);
|
||||
#else
|
||||
|
||||
explicit Arena(size_t val_in_MB = 4096);
|
||||
#endif
|
||||
|
||||
Status Init();
|
||||
};
|
||||
|
|
|
@ -27,11 +27,7 @@ namespace dataset {
|
|||
Status CircularPool::AddOneArena() {
|
||||
Status rc;
|
||||
std::shared_ptr<Arena> b;
|
||||
#ifdef ENABLE_GPUQUE
|
||||
RETURN_IF_NOT_OK(Arena::CreateArena(&b, arena_size_, is_cuda_malloc_));
|
||||
#else
|
||||
RETURN_IF_NOT_OK(Arena::CreateArena(&b, arena_size_));
|
||||
#endif
|
||||
tail_ = b.get();
|
||||
cur_size_in_mb_ += arena_size_;
|
||||
mem_segments_.push_back(std::move(b));
|
||||
|
@ -198,22 +194,13 @@ int CircularPool::PercentFree() const {
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
CircularPool::CircularPool(int max_size_in_gb, int arena_size, bool is_cuda_malloc)
|
||||
: unlimited_(max_size_in_gb <= 0),
|
||||
max_size_in_mb_(unlimited_ ? std::numeric_limits<int32_t>::max() : max_size_in_gb * 1024),
|
||||
arena_size_(arena_size),
|
||||
is_cuda_malloc_(is_cuda_malloc),
|
||||
cur_size_in_mb_(0) {}
|
||||
#else
|
||||
CircularPool::CircularPool(int max_size_in_gb, int arena_size)
|
||||
: unlimited_(max_size_in_gb <= 0),
|
||||
max_size_in_mb_(unlimited_ ? std::numeric_limits<int32_t>::max() : max_size_in_gb * 1024),
|
||||
arena_size_(arena_size),
|
||||
cur_size_in_mb_(0) {}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb, int arena_size,
|
||||
bool createOneArena, bool is_cuda_malloc) {
|
||||
Status rc;
|
||||
|
@ -234,28 +221,6 @@ Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, i
|
|||
}
|
||||
return rc;
|
||||
}
|
||||
#else
|
||||
Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb, int arena_size,
|
||||
bool createOneArena) {
|
||||
Status rc;
|
||||
if (out_pool == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED("pPool is null");
|
||||
}
|
||||
auto pool = new (std::nothrow) CircularPool(max_size_in_gb, arena_size);
|
||||
if (pool == nullptr) {
|
||||
RETURN_STATUS_OOM("Out of memory.");
|
||||
}
|
||||
if (createOneArena) {
|
||||
rc = pool->AddOneArena();
|
||||
}
|
||||
if (rc.IsOk()) {
|
||||
(*out_pool).reset(pool);
|
||||
} else {
|
||||
delete pool;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
|
||||
CircularPool::~CircularPool() = default;
|
||||
} // namespace dataset
|
||||
|
|
|
@ -85,13 +85,8 @@ class CircularPool : public MemoryPool {
|
|||
return os;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
static Status CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb = -1,
|
||||
int arena_size = 4096, bool create_one_arena = false, bool is_cuda_malloc = false);
|
||||
#else
|
||||
static Status CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb = -1,
|
||||
int arena_size = 4096, bool create_one_arena = false);
|
||||
#endif
|
||||
|
||||
private:
|
||||
ListOfArenas mem_segments_;
|
||||
|
@ -101,16 +96,10 @@ class CircularPool : public MemoryPool {
|
|||
int arena_size_;
|
||||
int cur_size_in_mb_;
|
||||
RWLock rw_lock_;
|
||||
#ifdef ENABLE_GPU
|
||||
bool is_cuda_malloc_;
|
||||
|
||||
// We can take negative or 0 as input which means unlimited.
|
||||
CircularPool(int max_size_in_gb, int arena_size, bool is_cuda_malloc);
|
||||
#else
|
||||
|
||||
// We can take negative or 0 as input which means unlimited.
|
||||
CircularPool(int max_size_in_gb, int arena_size);
|
||||
#endif
|
||||
|
||||
Status AddOneArena();
|
||||
};
|
||||
|
|
|
@ -22,12 +22,10 @@
|
|||
#if defined(__ANDROID__) || defined(ANDROID)
|
||||
#include "minddata/dataset/util/services.h"
|
||||
#endif
|
||||
#ifdef ENABLE_TDTQUE
|
||||
#include "acl/acl_tdt.h"
|
||||
#include "tdt/status.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
#ifdef WITH_BACKEND
|
||||
#include "utils/ms_context.h"
|
||||
#include "mindspore/ccsrc/include/backend/data_queue/data_queue_mgr.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
thread_local Task *gMyTask = nullptr;
|
||||
|
@ -141,6 +139,10 @@ Status Task::Run() {
|
|||
}
|
||||
|
||||
Status Task::Join(WaitFlag blocking) {
|
||||
#ifdef WITH_BACKEND
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
std::string device_target = MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
#endif
|
||||
if (running_) {
|
||||
RETURN_UNEXPECTED_IF_NULL(MyTaskGroup());
|
||||
auto interrupt_svc = MyTaskGroup()->GetIntrpService();
|
||||
|
@ -162,23 +164,27 @@ Status Task::Join(WaitFlag blocking) {
|
|||
MS_LOG(WARNING) << MyName() << " Thread ID " << ss.str() << " is not responding. Interrupt again";
|
||||
interrupt_svc->InterruptAll();
|
||||
wait_times++;
|
||||
#ifdef ENABLE_TDTQUE
|
||||
// Because hostPush hung in DeviceQueueOp, wait 5 seconds and destroy the tdt
|
||||
if (wait_times > 5 && my_name_.find("DeviceQueueOp") != std::string::npos) {
|
||||
MS_LOG(WARNING) << "Wait " << wait_times << " seconds, "
|
||||
<< "the task: " << my_name_ << " will be destroyed by TdtHostDestory.";
|
||||
if (!TdtHandle::DestroyHandle()) {
|
||||
MS_LOG(WARNING) << "Destroy tdt channel failed.";
|
||||
} else {
|
||||
MS_LOG(INFO) << "Destroy tdt channel success.";
|
||||
}
|
||||
#ifdef WITH_BACKEND
|
||||
if (device_target == kAscendDevice) {
|
||||
// Because hostPush may hang in DeviceQueueOp, wait 5 seconds and then destroy the tdt channel
|
||||
if (wait_times > 5 && my_name_.find("DeviceQueueOp") != std::string::npos) {
|
||||
MS_LOG(WARNING) << "Wait " << wait_times << " seconds, "
|
||||
<< "the task: " << my_name_ << " will be destroyed by TdtHostDestory.";
|
||||
auto queue =
|
||||
device::DataQueueMgr::GetInstance().CreateDataQueue(kAscendDevice, {}, false, 0, nullptr, {});
|
||||
if (queue != nullptr && !queue->Destroy()) {
|
||||
MS_LOG(WARNING) << "Destroy tdt channel failed.";
|
||||
} else {
|
||||
MS_LOG(INFO) << "Destroy tdt channel success.";
|
||||
}
|
||||
|
||||
// just wait 30 seconds
|
||||
// case1: cpu usage 100%, DeviceQueueOp thread may destroy without thrd_ future
|
||||
if (wait_times > kWaitInterruptTaskTime) {
|
||||
MS_LOG(WARNING) << MyName() << " Thread ID " << ss.str()
|
||||
<< " is not responding. Maybe it's destroyed, task stop.";
|
||||
break;
|
||||
// just wait 30 seconds
|
||||
// case1: cpu usage 100%, DeviceQueueOp thread may destroy without thrd_ future
|
||||
if (wait_times > kWaitInterruptTaskTime) {
|
||||
MS_LOG(WARNING) << MyName() << " Thread ID " << ss.str()
|
||||
<< " is not responding. Maybe it's destroyed, task stop.";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1109,7 +1109,7 @@ bool TaskEmitAction(const ResourcePtr &resource) {
|
|||
resource->SetResult(kOutput, graph_id);
|
||||
return true;
|
||||
}
|
||||
std::vector<PrimitivePtr> cut_list = compile::nonlinear_ops;
|
||||
std::vector<PrimitivePtr> cut_list = compile::GetNonlinearOps();
|
||||
if (bc_ptr->name() == kMsConvert) {
|
||||
cut_list = compile::GetMsNonlinearOps();
|
||||
}
|
||||
|
|
|
@ -7,11 +7,6 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
|
|||
list(REMOVE_ITEM DEVICE_SRC_LIST "distribute/mpi_collective_group.cc"
|
||||
"distribute/collective_group_wrapper.cc" "distribute/mpi_pycc.cc")
|
||||
|
||||
if(ENABLE_TDTQUE)
|
||||
file(GLOB_RECURSE TDT_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
${CMAKE_SOURCE_DIR}/mindspore/ccsrc/minddata/dataset/engine/tdt/tdt_handle.cc)
|
||||
endif()
|
||||
|
||||
if(ENABLE_MPI AND ENABLE_D)
|
||||
set_property(SOURCE "distribute/mpi_pycc.cc"
|
||||
PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
|
||||
|
|
|
@ -16,16 +16,113 @@
|
|||
|
||||
#include "plugin/device/ascend/hal/device/ascend_data_queue.h"
|
||||
#include <string>
|
||||
#include "plugin/device/ascend/hal/device/ascend_kernel_runtime.h"
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include "graph/def_types.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "utils/log_adapter.h"
|
||||
#include "plugin/device/ascend/hal/device/ascend_kernel_runtime.h"
|
||||
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace {
|
||||
const std::map<aclDataType, std::string> kAclTypeToString = {
|
||||
{ACL_INT8, "int8"}, {ACL_UINT8, "uint8"}, {ACL_INT16, "int16"}, {ACL_UINT16, "uint16"},
|
||||
{ACL_INT32, "int32"}, {ACL_UINT32, "uint32"}, {ACL_INT64, "int64"}, {ACL_UINT64, "uint64"},
|
||||
{ACL_FLOAT16, "float16"}, {ACL_FLOAT, "float32"}, {ACL_DOUBLE, "float64"}, {ACL_BOOL, "bool"}};
|
||||
|
||||
const std::map<std::string, aclDataType> kStringTypeToAclType = []() -> std::map<std::string, aclDataType> {
|
||||
std::map<std::string, aclDataType> ret;
|
||||
for (const auto &[acl_type, type_str] : kAclTypeToString) {
|
||||
ret.emplace(type_str, acl_type);
|
||||
}
|
||||
return ret;
|
||||
}();
|
||||
|
||||
constexpr auto kUnknownErrorString = "Unknown error occurred";
|
||||
std::map<void **, std::thread *> g_acl_handle_map = {};
|
||||
|
||||
constexpr auto kMbufHeadEndOfSequencePos = 128U;
|
||||
constexpr auto kEndOfSequenceFlag = 0x5A;
|
||||
constexpr auto kTransIdOffset = 64UL;
|
||||
constexpr auto kSleepMilliSeconds = 500;
|
||||
|
||||
std::atomic<bool> g_acl_destroy_all = false;
|
||||
|
||||
bool GetAclDataType(const std::string &str_type, aclDataType *acl_type) {
|
||||
MS_EXCEPTION_IF_NULL(acl_type);
|
||||
auto iter = kStringTypeToAclType.find(str_type);
|
||||
if (iter == kStringTypeToAclType.end()) {
|
||||
MS_LOG(EXCEPTION) << "Invalid type " << str_type;
|
||||
}
|
||||
*acl_type = iter->second;
|
||||
return true;
|
||||
}
|
||||
|
||||
void CheckRtRetWithError(rtError_t error, const std::string &msg) {
|
||||
if (error != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Rt error: " << msg << " | Error number: " << error;
|
||||
}
|
||||
}
|
||||
|
||||
void ReportErrorMessage() {
|
||||
const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
if (!error_message.empty() && error_message.find(kUnknownErrorString) == std::string::npos) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace tdt_handle {
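// Process-wide registry of open ACL TDT channel handles (g_acl_handle_map).
// DestroyHandle() stops and destroys every registered channel, joins the worker thread
// attached to it, and sets g_acl_destroy_all so IsClosed() reports that the channels
// were torn down deliberately (e.g. training finished before the pipeline drained).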
|
||||
void AddHandle(acltdtChannelHandle **handle, std::thread *use_thread) {
|
||||
if (*handle != nullptr) {
|
||||
auto ret = g_acl_handle_map.emplace(reinterpret_cast<void **>(handle), use_thread);
|
||||
if (!std::get<1>(ret)) {
|
||||
MS_LOG(ERROR) << "Failed to add new handle to acl_handle_map." << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DelHandle(acltdtChannelHandle **handle) {
|
||||
void **void_handle = reinterpret_cast<void **>(handle);
|
||||
g_acl_handle_map.erase(void_handle);
|
||||
}
|
||||
|
||||
bool DestroyHandle() {
|
||||
bool destroy_all = true;
|
||||
for (auto &item : g_acl_handle_map) {
|
||||
acltdtChannelHandle **handle = reinterpret_cast<acltdtChannelHandle **>(item.first);
|
||||
if (*handle != nullptr) {
|
||||
aclError stop_status = acltdtStopChannel(*handle);
|
||||
if (stop_status != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed stop acl data channel and the stop status is " << stop_status << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (item.second != nullptr && item.second->joinable()) {
|
||||
item.second->join();
|
||||
}
|
||||
if (acltdtDestroyChannel(*handle) != ACL_SUCCESS) {
|
||||
MS_LOG(INFO) << "acltdtDestroyChannel failed.";
|
||||
destroy_all = false;
|
||||
} else {
|
||||
*handle = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// clear the map container when all the handles have been destroyed
|
||||
if (destroy_all) {
|
||||
g_acl_handle_map.clear();
|
||||
g_acl_destroy_all = true;
|
||||
}
|
||||
return destroy_all;
|
||||
}
|
||||
|
||||
bool IsClosed() { return g_acl_destroy_all; }
|
||||
} // namespace tdt_handle
|
||||
|
||||
AscendDataQueueDynamic::AscendDataQueueDynamic(const size_t capacity)
|
||||
: DataQueue(capacity), stream_(nullptr), node_info_(nullptr) {
|
||||
auto context_key = device_context_->device_context_key();
|
||||
|
@ -35,12 +132,12 @@ AscendDataQueueDynamic::AscendDataQueueDynamic(const size_t capacity)
|
|||
stream_ = runtime_instance->compute_stream();
|
||||
}
|
||||
|
||||
BlockQueueStatus_T AscendDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
||||
DataQueueStatus AscendDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
||||
for (size_t i = 0; i < data.size(); i++) {
|
||||
auto &item = data[i];
|
||||
if (item.data_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Invalid Input: ptr: " << item.data_ptr_ << ", len: " << item.data_len_;
|
||||
return ERROR_INPUT;
|
||||
return DataQueueStatus::ERROR_INPUT;
|
||||
}
|
||||
void *addr = device_context_->device_res_manager_->AllocateMemory(item.data_len_);
|
||||
if (addr == nullptr) {
|
||||
|
@ -52,26 +149,480 @@ BlockQueueStatus_T AscendDataQueueDynamic::Push(std::vector<DataQueueItem> data)
|
|||
item.device_addr_ = addr;
|
||||
}
|
||||
CheckRtRetWithError(rtStreamSynchronize(stream_), "Call runtime rtStreamSynchronize failed");
|
||||
node_info_[tail_].data_ = data;
|
||||
node_info_[tail_].data_ = std::move(data);
|
||||
tail_ = (tail_ + 1) % (capacity_);
|
||||
++size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T AscendDataQueueDynamic::Front(std::vector<DataQueueItem> *data) const {
|
||||
DataQueueStatus AscendDataQueueDynamic::Front(std::vector<DataQueueItem> *data) const {
|
||||
for (auto &item : node_info_[head_].data_) {
|
||||
host_release_(item.data_ptr_, item.worker_id_);
|
||||
}
|
||||
*data = node_info_[head_].data_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T AscendDataQueueDynamic::Pop() {
|
||||
DataQueueStatus AscendDataQueueDynamic::Pop() {
|
||||
head_ = (head_ + 1) % (capacity_);
|
||||
--size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
bool AscendDataQueueDynamic::Destroy() { return true; }
|
||||
|
||||
AscendTdtQueue::AscendTdtQueue(const std::string &channel_name) : DataQueue(0), channel_name_(channel_name) {
|
||||
// init ErrorManager, 0 means success
|
||||
if (ErrorManager::GetInstance().Init() != 0) {
|
||||
MS_LOG(WARNING) << "[Internal Error] Init ErrorManager failed.";
|
||||
}
|
||||
// get device id
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
device_id_ = MsContext::GetInstance()->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
|
||||
// create acl tdt handle
|
||||
if (!channel_name.empty()) {
|
||||
acl_handle_ = acltdtCreateChannel(device_id_, channel_name.c_str());
|
||||
if (acl_handle_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Create channel for sending data failed, please check DEVICE ID setting, DEVICE ID that passed "
|
||||
"into dataset(from context) and training process should be the same.";
|
||||
ReportErrorMessage();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
AscendTdtQueue::~AscendTdtQueue() {
|
||||
if (acl_handle_ != nullptr) {
|
||||
if (acltdtDestroyChannel(acl_handle_) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed to destroy channel for tdt queue.";
|
||||
ReportErrorMessage();
|
||||
} else {
|
||||
tdt_handle::DelHandle(&acl_handle_);
|
||||
acl_handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool AscendTdtQueue::Destroy() { return tdt_handle::DestroyHandle(); }
|
||||
|
||||
bool AscendTdtQueue::IsOpen() const { return !tdt_handle::IsClosed(); }
|
||||
|
||||
DataQueueStatus AscendTdtQueue::Push(std::vector<DataQueueItem> data) {
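// Convert the row into an acltdtDataset, hand the first item to the PS data prefetcher,
// then block in acltdtSendTensor (timeout -1). A send failure after the channel was
// closed by tdt_handle::DestroyHandle() is reported as SUCCESS, since in that case
// training has already finished and the channel was shut down on purpose.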
|
||||
MS_LOG(DEBUG) << "TDT channel name is " << channel_name_ << ".";
|
||||
acltdtDataset *acl_dataset = nullptr;
|
||||
auto ret = Translate(data, &acl_dataset);
|
||||
if (!ret) {
|
||||
DestroyAclDataset(acl_dataset);
|
||||
MS_LOG(ERROR) << "Converting into TDT tensor failed!";
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
|
||||
// Data prefetch only when PS mode enables cache.
|
||||
if (acltdtGetDatasetSize(acl_dataset) > 0) {
|
||||
acltdtDataItem *item0 = acltdtGetDataItem(acl_dataset, 0);
|
||||
std::string item_type;
|
||||
ParseType(acltdtGetDataTypeFromItem(item0), &item_type);
|
||||
if (!ps::PsDataPrefetch::GetInstance().PrefetchData(channel_name_, acltdtGetDataAddrFromItem(item0),
|
||||
acltdtGetDataSizeFromItem(item0), item_type)) {
|
||||
MS_LOG(ERROR) << "PrefetchData failed in when pre-processing sending data.";
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
}
|
||||
auto status = acltdtSendTensor(acl_handle_, acl_dataset, -1);
|
||||
DestroyAclDataset(acl_dataset);
|
||||
if (status != ACL_SUCCESS) {
|
||||
// if the device_queue thread had been interrupted by master, just print warning and return success
|
||||
if (tdt_handle::IsClosed()) {
|
||||
MS_LOG(WARNING) << "Device queue thread had been interrupted by TdtHandle::DestroyHandle, you can ignore "
|
||||
<< "the above error: 'failed to send...'. In this scenario, the training ends first without "
|
||||
<< "using all epoch(s) data, and the data preprocessing is blocked by the data "
|
||||
<< "transmission channel on the device side. So we force the data transmission channel to stop.";
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
ReportErrorMessage();
|
||||
MS_LOG(ERROR) << "Tdt Send data failed.";
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
void AscendTdtQueue::ParseType(aclDataType acl_data_type, std::string *data_type) {
|
||||
auto type_iter = kAclTypeToString.find(acl_data_type);
|
||||
if (type_iter == kAclTypeToString.end()) {
|
||||
MS_LOG(EXCEPTION) << "Got unsupported acl datatype: " << acl_data_type;
|
||||
}
|
||||
*data_type = type_iter->second;
|
||||
}
|
||||
|
||||
bool AscendTdtQueue::Translate(const std::vector<DataQueueItem> &data, acltdtDataset **output_acl_dataset) {
|
||||
auto acl_dataset = acltdtCreateDataset();
|
||||
if (acl_dataset == nullptr) {
|
||||
MS_LOG(ERROR) << "Create tdt dataset failed.";
|
||||
return false;
|
||||
}
|
||||
bool status = AssembleTensor2AclDataset(data, acl_dataset);
|
||||
if (!status) {
|
||||
DestroyAclDataset(acl_dataset);
|
||||
MS_LOG(ERROR) << "Assemble tensor row to tdt dataset failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
*output_acl_dataset = acl_dataset;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendTdtQueue::AssembleTensor2AclDataset(const std::vector<DataQueueItem> &data, acltdtDataset *acl_dataset) {
|
||||
if (data.empty()) {
|
||||
acltdtDataItem *acl_data =
|
||||
acltdtCreateDataItem(acltdtTensorType::ACL_TENSOR_DATA_END_OF_SEQUENCE, nullptr, 0, ACL_BOOL, nullptr, 0);
|
||||
if (acl_data == nullptr) {
|
||||
MS_LOG(ERROR) << "Create data item failed when send empty data.";
|
||||
return false;
|
||||
}
|
||||
if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_SUCCESS) {
|
||||
if (acltdtDestroyDataItem(acl_data) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destroy data item failed when send empty data.";
|
||||
}
|
||||
MS_LOG(ERROR) << "Add data item to tdt dataset failed when send data.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
for (const auto &ts : data) {
|
||||
aclDataType acl_type;
|
||||
acltdtDataItem *acl_data = nullptr;
|
||||
if (!GetAclDataType(ts.data_type_, &acl_type)) {
|
||||
MS_LOG(ERROR) << "Convert type " << ts.data_type_ << " to acl type failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto &shape = ts.shapes_;
|
||||
std::string shape_str = "[";
|
||||
for (auto dim : shape) {
|
||||
(void)shape_str.append(std::to_string(dim)).append(",");
|
||||
}
|
||||
shape_str.pop_back();
|
||||
(void)shape_str.append("]");
|
||||
|
||||
void *data_ptr = ts.data_ptr_;
|
||||
size_t data_size = ts.data_len_;
|
||||
|
||||
acl_data = acltdtCreateDataItem(acltdtTensorType::ACL_TENSOR_DATA_TENSOR, (shape.empty() ? nullptr : &shape[0]),
|
||||
shape.size(), acl_type, data_ptr, data_size);
|
||||
if (acl_data == nullptr) {
|
||||
MS_LOG(ERROR) << "Create data item failed when send data.";
|
||||
return false;
|
||||
}
|
||||
if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_SUCCESS) {
|
||||
if (acltdtDestroyDataItem(acl_data) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destroy data item failed when send data with type ACL_TENSOR_DATA_TENSOR.";
|
||||
}
|
||||
MS_LOG(INFO) << "Add data item to tdt dataset failed when send data.";
|
||||
return false;
|
||||
}
|
||||
|
||||
MS_LOG(DEBUG) << "TDT data type is TDT_TENSOR, tensor type is " << acl_type << ", tensor shape is " << shape_str
|
||||
<< ", data length is " << data_size << ".";
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void AscendTdtQueue::DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item) {
|
||||
if (include_data_item) {
|
||||
for (size_t i = 0; i < acltdtGetDatasetSize(acl_dataset); i++) {
|
||||
if (acltdtDestroyDataItem(acltdtGetDataItem(acl_dataset, i)) != ACL_SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "Destroy data item failed when send data.";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (acltdtDestroyDataset(acl_dataset) != ACL_SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "Destroy tdt dataset failed when send data.";
|
||||
}
|
||||
}
|
||||
|
||||
AscendHostQueue::AscendHostQueue(const std::string &channel_name)
|
||||
: DataQueue(0), channel_name_(channel_name), queue_id_to_trans_id_map_(), queue_id_(0) {
|
||||
// init ErrorManager, 0 means success
|
||||
if (ErrorManager::GetInstance().Init() != 0) {
|
||||
MS_LOG(WARNING) << "[Internal Error] Init ErrorManager failed.";
|
||||
}
|
||||
// get device id
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
device_id_ = MsContext::GetInstance()->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
|
||||
if (!HostQueueInit()) {
|
||||
MS_LOG(WARNING) << "Host queue init failed.";
|
||||
}
|
||||
}
|
||||
|
||||
DataQueueStatus AscendHostQueue::Push(std::vector<DataQueueItem> data) {
|
||||
if (!SendDataByHostQueue(data)) {
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::HostQueueInit() {
|
||||
auto ret = rtSetDevice(device_id_);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtSetDevice failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
ret = rtMemQueueInit(device_id_);
|
||||
if (ret != ACL_RT_SUCCESS && ret != ACL_ERROR_RT_REPEATED_INIT) {
|
||||
MS_LOG(ERROR) << "call rtMemQueueInit failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
rtMemQueueAttr_t attr = {};
|
||||
auto mem_ret = memset_s(attr.name, RT_MQ_MAX_NAME_LEN, 0, RT_MQ_MAX_NAME_LEN);
|
||||
if (mem_ret != EOK) {
|
||||
MS_LOG(ERROR) << "call memset_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
mem_ret = memcpy_s(attr.name, RT_MQ_MAX_NAME_LEN, channel_name_.c_str(), channel_name_.size() + 1);
|
||||
if (mem_ret != EOK) {
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
attr.depth = 128U;
|
||||
attr.workMode = RT_MQ_MODE_DEFAULT;
|
||||
attr.flowCtrlFlag = false;
|
||||
attr.flowCtrlDropTime = 0U;
|
||||
attr.overWriteFlag = false;
|
||||
ret = rtMemQueueCreate(device_id_, &attr, &queue_id_);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMemQueueCreate failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
rtMemBuffCfg_t buff_cfg = {};
|
||||
ret = rtMbufInit(&buff_cfg);
|
||||
if (ret != ACL_RT_SUCCESS && ret != ACL_ERROR_RT_REPEATED_INIT) {
|
||||
MS_LOG(ERROR) << "call rtMbufInit failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
const std::lock_guard<std::mutex> lk(queue_id_to_trans_id_map_mutex_);
|
||||
(void)queue_id_to_trans_id_map_.emplace(queue_id_, 0UL);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::SendDataByHostQueue(const std::vector<DataQueueItem> &data) {
|
||||
bool status;
|
||||
bool is_need_resend = false;
|
||||
void *buff = nullptr;
|
||||
// Status status;
|
||||
if (!LaunchTensor2MBuff(data, &buff)) {
|
||||
return false;
|
||||
}
|
||||
do {
|
||||
if (is_need_resend) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kSleepMilliSeconds));
|
||||
}
|
||||
status = EnqueueData(buff, &is_need_resend);
|
||||
} while (status && is_need_resend);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::SetTransId4MBuf(void **buff) {
|
||||
void *head_buff = nullptr;
|
||||
uint64_t head_size = 0UL;
|
||||
auto ret = rtMbufGetPrivInfo(*buff, &head_buff, &head_size);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufGetPrivInfo failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
uint64_t *trans_id = reinterpret_cast<uint64_t *>(static_cast<uint8_t *>(head_buff) + head_size - kTransIdOffset);
|
||||
const std::lock_guard<std::mutex> lk(queue_id_to_trans_id_map_mutex_);
|
||||
*trans_id = ++queue_id_to_trans_id_map_[queue_id_];
|
||||
MS_LOG(DEBUG) << "host queue[" << queue_id_ << "] set trans id[" << *trans_id << "] success";
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::LaunchTensor2MBuff(const std::vector<DataQueueItem> &data, void **buff) {
|
||||
std::vector<DataItemInfo> items;
|
||||
if (!CreateDataItemInfos(data, &items)) {
|
||||
return false;
|
||||
}
|
||||
if (!SerializeDataItemInfos(&items, buff)) {
|
||||
return false;
|
||||
}
|
||||
if (!SetTransId4MBuf(buff)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::EnqueueData(void *buff, bool *need_resend) {
|
||||
MS_EXCEPTION_IF_NULL(need_resend);
|
||||
*need_resend = false;
|
||||
auto rt_error = rtSetDevice(device_id_);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtSetDevice device failed, ret=" << rt_error;
|
||||
return false;
|
||||
}
|
||||
rt_error = rtMemQueueEnQueue(device_id_, queue_id_, buff);
|
||||
if (rt_error == RT_ERROR_NONE) {
|
||||
return true;
|
||||
} else if (rt_error == ACL_ERROR_RT_QUEUE_FULL) {
|
||||
*need_resend = true;
|
||||
MS_LOG(DEBUG) << "queue[" << queue_id_ << "] is full, need call rtMemQueueEnQueue again";
|
||||
} else {
|
||||
HostQueueFreeBuff(buff);
|
||||
MS_LOG(ERROR) << "host enqueue queue[" << queue_id_ << "] failed, ret = " << rt_error;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::CreateDataItemInfos(const std::vector<DataQueueItem> &data, std::vector<DataItemInfo> *items) {
|
||||
MS_EXCEPTION_IF_NULL(items);
|
||||
if (data.empty()) {
|
||||
items->emplace_back(BuildDataItemInfo(ACL_TENSOR_DATA_END_OF_SEQUENCE, ACL_BOOL, nullptr, 0UL, nullptr, 0UL));
|
||||
return true;
|
||||
}
|
||||
|
||||
for (auto &ts : data) {
|
||||
aclDataType acl_type;
|
||||
if (!GetAclDataType(ts.data_type_, &acl_type)) {
|
||||
MS_LOG(ERROR) << "Convert type " << ts.data_type_ << " to acl type failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto &shape = ts.shapes_;
|
||||
void *data_ptr = ts.data_ptr_;
|
||||
size_t data_size = ts.data_len_;
|
||||
|
||||
if (ts.data_type_ != "string") {
|
||||
items->emplace_back(BuildDataItemInfo(ACL_TENSOR_DATA_TENSOR, static_cast<int32_t>(acl_type),
|
||||
(shape.empty() ? nullptr : &shape[0]), shape.size(), data_ptr, data_size));
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Create data item failed when send data with type:" << ts.data_type_;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::SerializeDataItemInfos(std::vector<DataItemInfo> *items, void **buff) {
|
||||
size_t cnt = items->size();
|
||||
size_t total_size = 0UL;
|
||||
for (size_t i = 0UL; i < cnt; ++i) {
|
||||
(*items)[i].ctrl_info.cur_cnt = i;
|
||||
(*items)[i].ctrl_info.cnt = cnt;
|
||||
total_size +=
|
||||
sizeof(DataItemInfo::ItemInfo) + (*items)[i].ctrl_info.dim_num * sizeof(int64_t) + (*items)[i].ctrl_info.data_len;
|
||||
}
|
||||
|
||||
auto rt_error = rtMbufAlloc(buff, total_size);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufAlloc with size[" << total_size << "] failed, ret = " << rt_error;
|
||||
return false;
|
||||
}
|
||||
|
||||
void *data = nullptr;
|
||||
rt_error = rtMbufGetBuffAddr(*buff, &data);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call rtMbufGetBuffAddr with size[" << total_size << "] failed, ret = " << rt_error;
|
||||
return false;
|
||||
}
|
||||
|
||||
void *head_buf = nullptr;
|
||||
uint64_t head_size = 0UL;
|
||||
rt_error = rtMbufGetPrivInfo(*buff, &head_buf, &head_size);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call rtMbufGetPrivInfo failed, ret =" << rt_error;
|
||||
return false;
|
||||
}
|
||||
if ((head_buf != nullptr) && (head_size > kMbufHeadEndOfSequencePos)) {
|
||||
MS_LOG(DEBUG) << "host queue set end_of_sequence mbuf head.";
|
||||
}
|
||||
|
||||
size_t offset = 0UL;
|
||||
for (size_t i = 0UL; i < cnt; ++i) {
|
||||
auto mem_ret = memcpy_s(::ge::ValueToPtr(::ge::PtrToValue(data) + offset), sizeof(DataItemInfo::ItemInfo),
|
||||
&((*items)[i].ctrl_info), sizeof(DataItemInfo::ItemInfo));
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
offset += sizeof(DataItemInfo::ItemInfo);
|
||||
|
||||
for (size_t j = 0UL; j < (*items)[i].ctrl_info.dim_num; ++j) {
|
||||
mem_ret = memcpy_s(::ge::ValueToPtr(::ge::PtrToValue(data) + offset), sizeof(int64_t), &((*items)[i].dims[j]),
|
||||
sizeof(int64_t));
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
offset += sizeof(int64_t);
|
||||
}
|
||||
|
||||
if ((*items)[i].ctrl_info.data_len == 0UL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
mem_ret = memcpy_s(::ge::ValueToPtr(::ge::PtrToValue(data) + offset), (*items)[i].ctrl_info.data_len,
|
||||
(*items)[i].data_ptr, (*items)[i].ctrl_info.data_len);
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
offset += (*items)[i].ctrl_info.data_len;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
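The loop above implies a fixed per-item wire layout: one ItemInfo control header, then dim_num int64_t dimensions, then data_len payload bytes. A minimal sketch of that size computation (the helper name is illustrative and not part of this commit):

// Sketch only: mirrors the offsets used by SerializeDataItemInfos().
size_t SerializedSizeOfItem(const AscendHostQueue::DataItemInfo &item) {
  size_t size = sizeof(AscendHostQueue::DataItemInfo::ItemInfo);  // fixed control header
  size += item.ctrl_info.dim_num * sizeof(int64_t);               // shape dimensions
  size += static_cast<size_t>(item.ctrl_info.data_len);           // tensor payload
  return size;
}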
|
||||
|
||||
AscendHostQueue::DataItemInfo AscendHostQueue::BuildDataItemInfo(acltdtTensorType acl_data_type, int32_t tensor_type,
|
||||
const int64_t *dims, size_t dim_size, void *data_ptr,
|
||||
uint64_t data_len) {
|
||||
DataItemInfo item = {};
|
||||
item.ctrl_info.data_type = static_cast<int32_t>(acl_data_type);
|
||||
item.ctrl_info.tensor_type = tensor_type;
|
||||
item.ctrl_info.dim_num = dim_size;
|
||||
item.ctrl_info.data_len = data_len;
|
||||
item.dims = std::vector<int64_t>(dims, dims + dim_size);
|
||||
item.data_ptr = data_ptr;
|
||||
return item;
|
||||
}
|
||||
|
||||
void AscendHostQueue::HostQueueFreeBuff(void *buff) {
|
||||
auto rt_error = rtMbufFree(buff);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufFree failed, ret=" << rt_error;
|
||||
}
|
||||
}

namespace {
std::shared_ptr<DataQueue> CreateAscendDataQueue(const std::string &channel_name, bool dynamic_shape, size_t capacity,
void *, const std::vector<size_t> &) {
if (dynamic_shape) {
return std::make_shared<AscendDataQueueDynamic>(capacity);
}

int32_t is_heterogeneous = 0;
(void)rtGetIsHeterogenous(&is_heterogeneous);
if (is_heterogeneous != 0) {
return std::make_shared<AscendHostQueue>(channel_name);
} else {
return std::make_shared<AscendTdtQueue>(channel_name);
}
}

REGISTER_DATA_QUEUE_CREATOR(kAscendDevice, CreateAscendDataQueue);
} // namespace
} // namespace device
} // namespace mindspore
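REGISTER_DATA_QUEUE_CREATOR presumably expands to a static registrar that hands CreateAscendDataQueue to DataQueueMgr, so the manager can later resolve a creator by device name. A hedged sketch of that idiom (the registrar name is illustrative and the exact macro expansion is an assumption):

namespace {
struct AscendQueueRegistrar {
  AscendQueueRegistrar() {
    // Assumed wiring: the creator is stored under the device name for later lookup
    // by DataQueueMgr::CreateDataQueue(kAscendDevice, ...).
    mindspore::device::DataQueueMgr::GetInstance().RegisterDataQueueCreator(kAscendDevice, CreateAscendDataQueue);
  }
} g_ascend_queue_registrar;
}  // namespace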
@@ -20,23 +20,27 @@
|
|||
#include <unistd.h>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <functional>
|
||||
#include "runtime/hardware/device_context_manager.h"
|
||||
#include "runtime/data_queue/data_queue.h"
|
||||
#include "runtime/rt.h"
|
||||
#include "include/backend/data_queue/data_queue.h"
|
||||
#include "include/backend/visible.h"
|
||||
#include "runtime/rt.h"
|
||||
#include "acl/acl_tdt.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
class BACKEND_EXPORT AscendDataQueueDynamic : public DataQueue {
|
||||
public:
|
||||
explicit AscendDataQueueDynamic(const size_t capacity);
|
||||
virtual ~AscendDataQueueDynamic() = default;
|
||||
~AscendDataQueueDynamic() override = default;
|
||||
|
||||
BlockQueueStatus_T Push(std::vector<DataQueueItem> data);
|
||||
BlockQueueStatus_T Front(std::vector<DataQueueItem> *data) const;
|
||||
BlockQueueStatus_T Pop();
|
||||
bool Destroy();
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override;
|
||||
DataQueueStatus Pop() override;
|
||||
bool Destroy() override;
|
||||
void SetThreadDevice() override {}
|
||||
|
||||
private:
|
||||
struct NodeInfo {
|
||||
|
@@ -45,7 +49,82 @@ class BACKEND_EXPORT AscendDataQueueDynamic : public DataQueue {
|
|||
rtStream_t stream_;
|
||||
std::unique_ptr<NodeInfo[]> node_info_;
|
||||
};
|
||||
|
||||
namespace tdt_handle {
|
||||
void AddHandle(acltdtChannelHandle **handle, std::thread *use_thread);
|
||||
bool DestroyHandle();
|
||||
void DelHandle(acltdtChannelHandle **handle);
|
||||
bool IsClosed();
|
||||
} // namespace tdt_handle
|
||||
|
||||
class AscendTdtQueue : public DataQueue {
|
||||
public:
|
||||
explicit AscendTdtQueue(const std::string &channel_name);
|
||||
~AscendTdtQueue() override;
|
||||
|
||||
bool IsOpen() const override;
|
||||
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override { return DataQueueStatus::SUCCESS; }
|
||||
DataQueueStatus Pop() override { return DataQueueStatus::SUCCESS; }
|
||||
bool Destroy() override;
|
||||
void SetThreadDevice() override {}
|
||||
|
||||
private:
|
||||
void DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item = true);
|
||||
bool AssembleTensor2AclDataset(const std::vector<DataQueueItem> &data, acltdtDataset *acl_dataset);
|
||||
void ParseType(aclDataType acl_data_type, std::string *data_type);
|
||||
bool Translate(const std::vector<DataQueueItem> &data, acltdtDataset **output_acl_dataset);
|
||||
|
||||
acltdtChannelHandle *acl_handle_;
|
||||
std::string channel_name_;
|
||||
uint32_t device_id_;
|
||||
};
|
||||
|
||||
class AscendHostQueue : public DataQueue {
|
||||
public:
|
||||
explicit AscendHostQueue(const std::string &channel_name);
|
||||
~AscendHostQueue() override = default;
|
||||
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override { return DataQueueStatus::SUCCESS; }
|
||||
DataQueueStatus Pop() override { return DataQueueStatus::SUCCESS; }
|
||||
bool Destroy() override { return true; }
|
||||
void SetThreadDevice() override {}
|
||||
|
||||
struct DataItemInfo {
|
||||
struct ItemInfo {
|
||||
int32_t version;
|
||||
int32_t data_type;
|
||||
uint32_t cur_cnt;
|
||||
uint32_t cnt;
|
||||
int32_t tensor_type;
|
||||
uint32_t dim_num;
|
||||
char reserved[32];
|
||||
uint64_t data_len;
|
||||
} ctrl_info;
|
||||
std::vector<int64_t> dims;
|
||||
void *data_ptr;
|
||||
};
|
||||
|
||||
private:
|
||||
bool HostQueueInit();
|
||||
bool SendDataByHostQueue(const std::vector<DataQueueItem> &data);
|
||||
bool SetTransId4MBuf(void **buff);
|
||||
bool LaunchTensor2MBuff(const std::vector<DataQueueItem> &data, void **buff);
|
||||
bool EnqueueData(void *buff, bool *need_resend);
|
||||
bool CreateDataItemInfos(const std::vector<DataQueueItem> &data, std::vector<DataItemInfo> *items);
|
||||
bool SerializeDataItemInfos(std::vector<DataItemInfo> *items, void **buff);
|
||||
DataItemInfo BuildDataItemInfo(acltdtTensorType acl_data_type, int32_t tensor_type, const int64_t *dims,
|
||||
size_t dim_size, void *data_ptr, uint64_t data_len);
|
||||
void HostQueueFreeBuff(void *buff);
|
||||
|
||||
std::string channel_name_;
|
||||
uint32_t device_id_;
|
||||
std::mutex queue_id_to_trans_id_map_mutex_;
|
||||
std::map<uint32_t, uint64_t> queue_id_to_trans_id_map_;
|
||||
uint32_t queue_id_;
|
||||
};
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_BLOCKING_QUEUE_H_
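For reference, the contract these classes fulfil is small; a purely illustrative DataQueue subclass (not present in the codebase) would only need the following overrides, assuming it lives in namespace mindspore::device:

class NoopDataQueue : public DataQueue {
 public:
  explicit NoopDataQueue(size_t capacity) : DataQueue(capacity) {}
  ~NoopDataQueue() override = default;

  DataQueueStatus Push(std::vector<DataQueueItem> data) override { return DataQueueStatus::SUCCESS; }
  DataQueueStatus Front(std::vector<DataQueueItem> *data) const override { return DataQueueStatus::SUCCESS; }
  DataQueueStatus Pop() override { return DataQueueStatus::SUCCESS; }
  bool Destroy() override { return true; }
  void SetThreadDevice() override {}
};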
|
||||
|
|
|
@@ -62,10 +62,7 @@
|
|||
#endif
|
||||
#include "include/common/utils/config_manager.h"
|
||||
#include "plugin/device/ascend/hal/hccl_adapter/hccl_adapter.h"
|
||||
#ifdef ENABLE_TDTQUE
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
using mindspore::dataset::TdtHandle;
|
||||
#endif
|
||||
#include "plugin/device/ascend/hal/device/ascend_data_queue.h"
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "include/common/debug/rdr/recorder_manager.h"
|
||||
#endif
|
||||
|
@@ -1102,10 +1099,10 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph &graph) {
|
|||
#ifndef ENABLE_SECURITY
|
||||
DumpTaskExceptionInfo(graph);
|
||||
#endif
|
||||
#ifdef ENABLE_TDTQUE
|
||||
#ifdef WITH_BACKEND
|
||||
// Run task error, we should call TdtHostDestroy to release tdt to avoid DeviceQueueOp hostPush hung
|
||||
// case1: cpu usage 100% cause thread/process exit, but some tdt thread remain in backend
|
||||
if (!TdtHandle::DestroyHandle()) {
|
||||
if (!tdt_handle::DestroyHandle()) {
|
||||
MS_LOG(WARNING) << "Destroy tdt channel failed.";
|
||||
} else {
|
||||
MS_LOG(INFO) << "Destroy tdt channel success.";
|
||||
|
|
|
@@ -27,7 +27,7 @@
|
|||
#include "ir/dtype/type.h"
|
||||
#include "acl/acl_tdt.h"
|
||||
#include "proto/print.pb.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
#include "plugin/device/ascend/hal/device/ascend_data_queue.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
namespace mindspore::device::ascend {
|
||||
|
@@ -392,7 +392,7 @@ void CreateTensorPrintThread(const PrintThreadCrt &ctr) {
|
|||
<< MsContext::GetInstance()->get_param<uint32_t>(MS_CTX_TSD_REF) << ".";
|
||||
std::string print_file_path = MsContext::GetInstance()->get_param<std::string>(MS_CTX_PRINT_FILE_PATH);
|
||||
g_acl_tdt_print = ctr(print_file_path, g_acl_handle);
|
||||
dataset::TdtHandle::AddHandle(&g_acl_handle, &g_acl_tdt_print);
|
||||
tdt_handle::AddHandle(&g_acl_handle, &g_acl_tdt_print);
|
||||
}
|
||||
|
||||
void DestroyTensorPrintThread() {
|
||||
|
@@ -419,7 +419,7 @@ void DestroyTensorPrintThread() {
|
|||
MS_LOG(ERROR) << "Failed destroy acl channel and the destroyed_status is " << destroyed_status << std::endl;
|
||||
return;
|
||||
}
|
||||
dataset::TdtHandle::DelHandle(&g_acl_handle);
|
||||
tdt_handle::DelHandle(&g_acl_handle);
|
||||
MS_LOG(INFO) << "Succeed destroy acl channel";
|
||||
}
|
||||
} // namespace mindspore::device::ascend
|
||||
|
|
|
@@ -297,12 +297,10 @@ bool AscendDeprecatedInterface::OpenTsd(const std::shared_ptr<MsContext> &ms_con
|
|||
MS_LOG(EXCEPTION) << "Device " << device_id << " call rtSetDevice failed, ret[" << static_cast<int>(ret) << "]";
|
||||
}
|
||||
ms_context_ptr->increase_param<uint32_t>(MS_CTX_TSD_REF);
|
||||
#ifdef ENABLE_TDTQUE
|
||||
auto thread_crt = [](const std::string &path, const acltdtChannelHandle *acl_handle) {
|
||||
return std::thread(TensorPrint(path, acl_handle));
|
||||
};
|
||||
CreateTensorPrintThread(thread_crt);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -315,10 +313,8 @@ bool AscendDeprecatedInterface::CloseTsd(const std::shared_ptr<MsContext> &ms_co
|
|||
ms_context_ptr->decrease_param<uint32_t>(MS_CTX_TSD_REF);
|
||||
if (force || ms_context_ptr->get_param<uint32_t>(MS_CTX_TSD_REF) == 0) {
|
||||
ms_context_ptr->set_param<uint32_t>(MS_CTX_TSD_REF, 0);
|
||||
#ifdef ENABLE_TDTQUE
|
||||
pybind11::gil_scoped_release gil_release;
|
||||
DestroyTensorPrintThread();
|
||||
#endif
|
||||
if (ErrorManager::GetInstance().Init() != 0) {
|
||||
MS_LOG(WARNING) << "Init ascend error manager failed, some ascend error log may be left out.";
|
||||
}
|
||||
|
|
|
@@ -15,7 +15,7 @@
|
|||
*/
|
||||
|
||||
#include "plugin/device/ascend/hal/hardware/ascend_device_res_manager.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "runtime/rt.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
|
|
@@ -29,7 +29,7 @@
|
|||
#include "acl/acl_rt.h"
|
||||
#include "runtime/device/kernel_runtime.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
|
|
@@ -15,28 +15,55 @@
|
|||
*/
|
||||
#include "plugin/device/gpu/hal/device/gpu_data_queue.h"
|
||||
#include <string>
|
||||
#include "plugin/device/gpu/hal/device/queue_common.h"
|
||||
#include <utility>
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "plugin/device/gpu/hal/device/queue_common.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace {
|
||||
void SetThreadToDeviceId(uint32_t device_id) {
|
||||
// Without cudaSetDevice, CUDA memory will be allocated on GPU:0 by default
// and will overload it in distributed scenarios.
|
||||
auto ret = cudaSetDevice(device_id);
|
||||
if (ret != cudaSuccess) {
|
||||
MS_LOG(EXCEPTION) << "cudaSetDevice failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<void> AllocHostMemory(size_t size) {
|
||||
void *ptr;
|
||||
auto ret = cudaHostAlloc(&ptr, size, cudaHostAllocDefault);
|
||||
if (ret != cudaSuccess) {
|
||||
MS_LOG(EXCEPTION) << "cudaHostAlloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
|
||||
}
|
||||
|
||||
return std::shared_ptr<void>(ptr, [](void *ptr) -> void {
|
||||
if (ptr != nullptr) {
|
||||
(void)cudaFreeHost(ptr);
|
||||
}
|
||||
});
|
||||
}
|
||||
} // namespace
|
||||
GpuDataQueueDynamic::GpuDataQueueDynamic(const size_t capacity) : DataQueue(capacity), node_info_(nullptr) {
|
||||
node_info_ = std::make_unique<NodeInfo[]>(capacity);
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
const std::string &device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
device_context_ = DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
|
||||
device_id_ = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
device_context_ = DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id_});
|
||||
device_context_->Initialize();
|
||||
stream_ = reinterpret_cast<cudaStream_t>(gpu::GPUDeviceManager::GetInstance().default_stream());
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
||||
DataQueueStatus GpuDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
||||
for (size_t i = 0; i < data.size(); i++) {
|
||||
auto &item = data[i];
|
||||
if (item.data_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Invalid Input: ptr: " << item.data_ptr_ << ", len: " << item.data_len_;
|
||||
return ERROR_INPUT;
|
||||
return DataQueueStatus::ERROR_INPUT;
|
||||
}
|
||||
void *addr = device_context_->device_res_manager_->AllocateMemory(item.data_len_);
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(addr, item.data_ptr_, item.data_len_, cudaMemcpyHostToDevice, stream_),
|
||||
|
@@ -47,42 +74,49 @@ BlockQueueStatus_T GpuDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
|||
node_info_[tail_].event_.reset(new cudaEvent_t());
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&(*(node_info_[tail_].event_))), "Cuda Create Event Failed");
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(*(node_info_[tail_].event_), stream_), "Cuda Create Event Failed");
|
||||
node_info_[tail_].data_ = data;
|
||||
node_info_[tail_].data_ = std::move(data);
|
||||
tail_ = (tail_ + 1) % (capacity_);
|
||||
++size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuDataQueueDynamic::Front(std::vector<DataQueueItem> *data) const {
|
||||
DataQueueStatus GpuDataQueueDynamic::Front(std::vector<DataQueueItem> *data) const {
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(*(node_info_[head_].event_)), "Cuda Event Syn Failed");
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(*(node_info_[head_].event_)), "Cuda Destroy Event Failed");
|
||||
for (auto &item : node_info_[head_].data_) {
|
||||
host_release_(item.data_ptr_, item.worker_id_);
|
||||
}
|
||||
*data = node_info_[head_].data_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuDataQueueDynamic::Pop() {
|
||||
DataQueueStatus GpuDataQueueDynamic::Pop() {
|
||||
head_ = (head_ + 1) % (capacity_);
|
||||
--size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
bool GpuDataQueueDynamic::Destroy() { return true; }
|
||||
|
||||
GpuQueue::GpuQueue(void *addr, const std::vector<size_t> &shape, const size_t &capacity)
|
||||
void GpuDataQueueDynamic::SetThreadDevice() { SetThreadToDeviceId(device_id_); }
|
||||
|
||||
std::shared_ptr<void> GpuDataQueueDynamic::AllocHostMem(size_t size) { return AllocHostMemory(size); }
|
||||
|
||||
GpuQueue::GpuQueue(size_t capacity, void *addr, const std::vector<size_t> &shape)
|
||||
: DataQueue(capacity), buffer_(addr), shape_(shape), len_(0), stream_(0), node_info_(nullptr) {
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaStreamCreate(&stream_), "Cuda Create Stream Failed");
|
||||
node_info_ = std::make_unique<NodeInfo[]>(capacity);
|
||||
for (auto item : shape) {
|
||||
len_ += item;
|
||||
}
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
device_id_ = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
}
|
||||
|
||||
GpuQueue::~GpuQueue() { buffer_ = nullptr; }
|
||||
|
||||
BlockQueueStatus_T GpuQueue::Push(std::vector<DataQueueItem> data) {
|
||||
DataQueueStatus GpuQueue::Push(std::vector<DataQueueItem> data) {
|
||||
void *addr = reinterpret_cast<uint8_t *>(buffer_) + tail_ * len_;
|
||||
for (size_t i = 0; i < data.size(); i++) {
|
||||
auto &item = data[i];
|
||||
|
@@ -97,7 +131,7 @@ BlockQueueStatus_T GpuQueue::Push(std::vector<DataQueueItem> data) {
|
|||
<< shape_[i] << "), "
|
||||
<< "you need to call network.set_inputs() to "
|
||||
"configure dynamic dims of input data before running the network";
|
||||
return ERROR_INPUT;
|
||||
return DataQueueStatus::ERROR_INPUT;
|
||||
}
|
||||
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(addr, item.data_ptr_, item.data_len_, cudaMemcpyHostToDevice, stream_),
|
||||
|
@@ -109,26 +143,26 @@ BlockQueueStatus_T GpuQueue::Push(std::vector<DataQueueItem> data) {
|
|||
node_info_[tail_].event_.reset(new cudaEvent_t());
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&(*(node_info_[tail_].event_))), "Cuda Create Event Failed");
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(*(node_info_[tail_].event_), stream_), "Cuda Create Event Failed");
|
||||
node_info_[tail_].data_ = data;
|
||||
node_info_[tail_].data_ = std::move(data);
|
||||
tail_ = (tail_ + 1) % (capacity_);
|
||||
++size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuQueue::Front(std::vector<DataQueueItem> *data) const {
|
||||
DataQueueStatus GpuQueue::Front(std::vector<DataQueueItem> *data) const {
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(*(node_info_[head_].event_)), "Cuda Event Syn Failed");
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(*(node_info_[head_].event_)), "Cuda Destroy Event Failed");
|
||||
for (auto &item : node_info_[head_].data_) {
|
||||
host_release_(item.data_ptr_, item.worker_id_);
|
||||
}
|
||||
*data = node_info_[head_].data_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuQueue::Pop() {
|
||||
DataQueueStatus GpuQueue::Pop() {
|
||||
head_ = (head_ + 1) % (capacity_);
|
||||
--size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
bool GpuQueue::Destroy() {
|
||||
|
@@ -143,5 +177,22 @@ bool GpuQueue::Destroy() {
|
|||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
void GpuQueue::SetThreadDevice() { SetThreadToDeviceId(device_id_); }

std::shared_ptr<void> GpuQueue::AllocHostMem(size_t size) { return AllocHostMemory(size); }

namespace {
std::shared_ptr<DataQueue> CreateGpuDataQueue(const std::string &, bool dynamic_shape, size_t capacity, void *addr,
const std::vector<size_t> &shape) {
if (dynamic_shape) {
return std::make_shared<GpuDataQueueDynamic>(capacity);
}

return std::make_shared<GpuQueue>(capacity, addr, shape);
}

REGISTER_DATA_QUEUE_CREATOR(kGPUDevice, CreateGpuDataQueue);
} // namespace
} // namespace device
} // namespace mindspore
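A usage sketch of the static-queue path after this refactor, seen from the caller's side (channel name, buffer and shapes are made up, and the mindspore::device namespace is assumed to be in scope; every call now returns DataQueueStatus instead of the old BlockQueueStatus_T):

void ExampleStaticQueueRoundTrip(void *device_buffer) {
  auto &mgr = DataQueueMgr::GetInstance();
  const std::string channel = "example_channel";                 // hypothetical channel
  const std::vector<size_t> shapes = {32 * 32 * sizeof(float)};  // one output, size in bytes
  if (mgr.Create(channel, device_buffer, shapes, 2) != DataQueueStatus::SUCCESS) {
    return;
  }
  (void)mgr.Open(channel, [](void *host_ptr, int32_t worker_id) { /* release the host buffer */ });

  std::vector<DataQueueItem> items(1);  // filled by the data pipeline in practice
  if (mgr.Push(channel, items, 30) == DataQueueStatus::SUCCESS) {
    std::vector<DataQueueItem> out;
    if (mgr.Front(channel, &out) == DataQueueStatus::SUCCESS) {
      (void)mgr.Pop(channel);
    }
  }
}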
@@ -22,7 +22,7 @@
|
|||
#include <memory>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
#include "runtime/data_queue/data_queue.h"
|
||||
#include "include/backend/data_queue/data_queue.h"
|
||||
#include "runtime/hardware/device_context_manager.h"
|
||||
#include "include/backend/visible.h"
|
||||
|
||||
|
@@ -31,12 +31,15 @@ namespace device {
|
|||
class BACKEND_EXPORT GpuDataQueueDynamic : public DataQueue {
|
||||
public:
|
||||
explicit GpuDataQueueDynamic(const size_t capacity);
|
||||
virtual ~GpuDataQueueDynamic() = default;
|
||||
~GpuDataQueueDynamic() override = default;
|
||||
|
||||
BlockQueueStatus_T Push(std::vector<DataQueueItem> data);
|
||||
BlockQueueStatus_T Front(std::vector<DataQueueItem> *data) const;
|
||||
BlockQueueStatus_T Pop();
|
||||
bool Destroy();
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override;
|
||||
DataQueueStatus Pop() override;
|
||||
bool Destroy() override;
|
||||
|
||||
void SetThreadDevice() override;
|
||||
std::shared_ptr<void> AllocHostMem(size_t size) override;
|
||||
|
||||
private:
|
||||
struct NodeInfo {
|
||||
|
@@ -48,17 +51,21 @@ class BACKEND_EXPORT GpuDataQueueDynamic : public DataQueue {
|
|||
|
||||
cudaStream_t stream_;
|
||||
std::unique_ptr<NodeInfo[]> node_info_;
|
||||
uint32_t device_id_;
|
||||
};
|
||||
|
||||
class BACKEND_EXPORT GpuQueue : public DataQueue {
|
||||
public:
|
||||
GpuQueue(void *addr, const std::vector<size_t> &shape, const size_t &capacity);
|
||||
virtual ~GpuQueue();
|
||||
GpuQueue(size_t capacity, void *addr, const std::vector<size_t> &shape);
|
||||
~GpuQueue() override;
|
||||
|
||||
BlockQueueStatus_T Push(std::vector<DataQueueItem> data);
|
||||
BlockQueueStatus_T Front(std::vector<DataQueueItem> *data) const;
|
||||
BlockQueueStatus_T Pop();
|
||||
bool Destroy();
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override;
|
||||
DataQueueStatus Pop() override;
|
||||
bool Destroy() override;
|
||||
|
||||
void SetThreadDevice() override;
|
||||
std::shared_ptr<void> AllocHostMem(size_t size) override;
|
||||
|
||||
private:
|
||||
struct NodeInfo {
|
||||
|
@@ -73,6 +80,7 @@ class BACKEND_EXPORT GpuQueue : public DataQueue {
|
|||
cudaStream_t stream_;
|
||||
std::unique_ptr<NodeInfo[]> node_info_;
|
||||
bool ds_detected_{false};
|
||||
uint32_t device_id_;
|
||||
};
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
|
|
@@ -21,7 +21,7 @@
|
|||
#include "plugin/device/gpu/hal/device/gpu_device_address.h"
|
||||
#include "plugin/device/gpu/hal/device/cuda_driver.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_event.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
|
||||
#include "plugin/device/gpu/hal/device/distribution/collective_init.h"
|
||||
|
|
|
@@ -26,7 +26,7 @@
|
|||
#include "plugin/device/gpu/hal/device/distribution/collective_init.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
|
||||
#include "plugin/device/gpu/hal/hardware/gpu_somas.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "kernel/common_utils.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_common.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
|
||||
|
|
|
@@ -17,7 +17,7 @@
|
|||
#include "plugin/device/gpu/kernel/data/dataset_init_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/data/dataset_utils.h"
|
||||
#include "kernel/common_utils.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
|
||||
#include "include/common/utils/convert_utils.h"
|
||||
|
||||
|
@@ -60,7 +60,7 @@ bool DatasetInitKernelMod::Launch(const std::vector<AddressPtr> &, const std::ve
|
|||
}
|
||||
|
||||
auto status = DataQueueMgr::GetInstance().Create(queue_name_, addr, shapes_, buffer_q_capacity_);
|
||||
if (status) {
|
||||
if (status != device::DataQueueStatus::SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', init Dataset Failed. len: " << len << ", status:" << status;
|
||||
}
|
||||
|
||||
|
|
|
@@ -27,7 +27,7 @@
|
|||
#ifndef ENABLE_SECURITY
|
||||
#include "plugin/device/gpu/hal/profiler/gpu_profiling.h"
|
||||
#endif
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_common.h"
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "include/common/debug/rdr/recorder_manager.h"
|
||||
|
@@ -113,7 +113,7 @@ bool DatasetIteratorKernelMod::ReadDevice(std::vector<DataQueueItem> *data) {
|
|||
}
|
||||
#endif
|
||||
auto ret = DataQueueMgr::GetInstance().Front(queue_name_, data);
|
||||
if (ret == device::SUCCESS) {
|
||||
if (ret == device::DataQueueStatus::SUCCESS) {
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (profiling_enable_) {
|
||||
uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
|
||||
|
@@ -123,7 +123,7 @@ bool DatasetIteratorKernelMod::ReadDevice(std::vector<DataQueueItem> *data) {
|
|||
break;
|
||||
}
|
||||
|
||||
if (ret == device::TIMEOUT) {
|
||||
if (ret == device::DataQueueStatus::TIMEOUT) {
|
||||
repeat++;
|
||||
if (repeat < 10) {
|
||||
MS_LOG(INFO) << "Waiting for data...(" << repeat << " / 10)";
|
||||
|
@@ -155,7 +155,7 @@ bool DatasetIteratorKernelMod::Launch(const std::vector<AddressPtr> &, const std
|
|||
}
|
||||
if (!is_opened_) {
|
||||
auto ret = DataQueueMgr::GetInstance().OpenDynamicBufQueue(queue_name_);
|
||||
if (ret != device::BlockQueueStatus_T::SUCCESS) {
|
||||
if (ret != device::DataQueueStatus::SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', gpu Queue(" << queue_name_ << ") Open Failed: " << ret;
|
||||
}
|
||||
is_opened_ = true;
|
||||
|
|
|
@@ -24,7 +24,7 @@
|
|||
#include "plugin/device/gpu/kernel/data/dataset_profiling.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "runtime/data_queue/blocking_queue.h"
|
||||
#include "include/backend/data_queue/blocking_queue.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
using mindspore::device::DataQueueItem;
|
||||
|
|
|
@@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2019-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/data_queue/blocking_queue.h"
|
||||
#include "include/backend/data_queue/blocking_queue.h"
|
||||
#include "utils/log_adapter.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
@@ -22,60 +22,60 @@ namespace device {
|
|||
const size_t kTimeout = 100;
|
||||
void BlockingQueue::RegisterRelease(const std::function<void(void *, int32_t)> &func) { queue_->RegisterRelease(func); }
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Push(const std::vector<DataQueueItem> &data, unsigned int) {
|
||||
DataQueueStatus BlockingQueue::Push(const std::vector<DataQueueItem> &data, unsigned int) {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
if (queue_->IsFull()) {
|
||||
if (not_full_cond_.wait_for(locker, std::chrono::microseconds(kTimeout)) == std::cv_status::timeout) {
|
||||
return TIMEOUT;
|
||||
return DataQueueStatus::TIMEOUT;
|
||||
}
|
||||
}
|
||||
auto ret = queue_->Push(data);
|
||||
if (ret != SUCCESS) {
|
||||
if (ret != DataQueueStatus::SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
not_empty_cond_.notify_one();
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Front(std::vector<DataQueueItem> *data) {
|
||||
DataQueueStatus BlockingQueue::Front(std::vector<DataQueueItem> *data) {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
bool timeout = not_empty_cond_.wait_for(locker, std::chrono::seconds(30), [this] { return !queue_->IsEmpty(); });
|
||||
if (!timeout) {
|
||||
return TIMEOUT;
|
||||
return DataQueueStatus::TIMEOUT;
|
||||
}
|
||||
return queue_->Front(data);
|
||||
}
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Pop() {
|
||||
DataQueueStatus BlockingQueue::Pop() {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
not_empty_cond_.wait(locker, [this] { return !queue_->IsEmpty(); });
|
||||
auto ret = queue_->Pop();
|
||||
if (ret != SUCCESS) {
|
||||
if (ret != DataQueueStatus::SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
not_full_cond_.notify_one();
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Create(const std::shared_ptr<DataQueue> &data_queue) {
|
||||
DataQueueStatus BlockingQueue::Create(const std::shared_ptr<DataQueue> &data_queue) {
|
||||
this->queue_ = data_queue;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Clear() {
|
||||
DataQueueStatus BlockingQueue::Clear() {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
while (Size() > 0) {
|
||||
std::vector<DataQueueItem> data;
|
||||
auto ret = queue_->Front(&data);
|
||||
if (ret != SUCCESS) {
|
||||
if (ret != DataQueueStatus::SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
ret = queue_->Pop();
|
||||
if (ret != SUCCESS) {
|
||||
if (ret != DataQueueStatus::SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
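Front() above only waits 30 seconds for data, so callers see DataQueueStatus::TIMEOUT rather than blocking forever. A caller-side sketch of the usual bounded retry (it mirrors how the dataset-iterator kernel later in this commit handles it; the helper name is illustrative):

DataQueueStatus FrontWithRetry(BlockingQueue *queue, std::vector<DataQueueItem> *data, int max_retry = 10) {
  for (int i = 0; i < max_retry; ++i) {
    auto ret = queue->Front(data);
    if (ret != DataQueueStatus::TIMEOUT) {
      return ret;  // SUCCESS or a real error, both end the retry loop
    }
  }
  return DataQueueStatus::TIMEOUT;
}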
|
||||
|
||||
bool BlockingQueue::Destroy() {
|
||||
|
|
|
@@ -14,9 +14,11 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/data_queue/data_queue.h"
|
||||
#include "include/backend/data_queue/data_queue.h"
|
||||
#include <string>
|
||||
#include "utils/ms_context.h"
|
||||
#include "runtime/hardware/device_context_manager.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
DataQueue::DataQueue(const size_t capacity)
|
||||
|
|
|
@@ -14,18 +14,16 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#if ENABLE_GPU
|
||||
#include "plugin/device/gpu/hal/device/gpu_data_queue.h"
|
||||
#elif ENABLE_D
|
||||
#include "plugin/device/ascend/hal/device/ascend_data_queue.h"
|
||||
#endif
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include "utils/log_adapter.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "pybind11/pybind11.h"
|
||||
#include "pybind11/stl.h"
|
||||
#include "include/common/utils/anfalgo.h"
|
||||
#include "include/backend/data_queue/blocking_queue.h"
|
||||
#include "runtime/device/kernel_info.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
|
@@ -36,136 +34,151 @@ DataQueueMgr &DataQueueMgr::GetInstance() noexcept {
|
|||
return instance;
|
||||
}

BlockQueueStatus_T DataQueueMgr::Create(const std::string &channel_name, void *addr, const std::vector<size_t> &shape,
const size_t &capacity) {
void DataQueueMgr::RegisterDataQueueCreator(const std::string &device_name, DataQueueCreator &&creator) {
data_queue_creator_map_.emplace(device_name, std::forward<DataQueueCreator>(creator));
}

std::shared_ptr<DataQueue> DataQueueMgr::CreateDataQueue(const std::string &device_name,
const std::string &channel_name, bool dynamic_shape,
size_t capacity, void *addr,
const std::vector<size_t> &shape) {
auto iter = data_queue_creator_map_.find(device_name);
if (iter == data_queue_creator_map_.end()) {
return nullptr;
}

return iter->second(channel_name, dynamic_shape, capacity, addr, shape);
}
|
||||
|
||||
DataQueueStatus DataQueueMgr::Create(const std::string &channel_name, void *addr, const std::vector<size_t> &shape,
|
||||
const size_t &capacity) {
|
||||
MS_LOG(INFO) << "Static GPU queue: " << channel_name << " created";
|
||||
if (name_queue_map_.find(channel_name) != name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue already exist: " << channel_name;
|
||||
return QUEUE_EXIST;
|
||||
return DataQueueStatus::QUEUE_EXIST;
|
||||
}
|
||||
#if ENABLE_GPU
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
std::shared_ptr<DataQueue> gpu_queue = std::make_shared<GpuQueue>(addr, shape, capacity);
|
||||
BlockQueueStatus_T rt = queue->Create(gpu_queue);
|
||||
if (rt != SUCCESS) {
|
||||
MS_LOG(ERROR) << "Queue: " << channel_name << "create failed: " << rt;
|
||||
return rt;
|
||||
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
std::shared_ptr<DataQueue> data_queue = CreateDataQueue(
|
||||
MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET), channel_name, false, capacity, addr, shape);
|
||||
if (data_queue != nullptr) {
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
DataQueueStatus rt = queue->Create(data_queue);
|
||||
if (rt != DataQueueStatus::SUCCESS) {
|
||||
MS_LOG(ERROR) << "Queue: " << channel_name << "create failed: " << rt;
|
||||
return rt;
|
||||
}
|
||||
(void)name_queue_map_.insert(std::make_pair(channel_name, queue));
|
||||
init_ = true;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
(void)name_queue_map_.insert(std::make_pair(channel_name, queue));
|
||||
init_ = true;
|
||||
return SUCCESS;
|
||||
#else
|
||||
|
||||
MS_LOG(ERROR) << "Static data queue only support GPU target.";
|
||||
return INTERNAL_ERROR;
|
||||
#endif
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Open(const std::string &channel_name,
|
||||
const std::function<void(void *, int32_t)> func) {
|
||||
DataQueueStatus DataQueueMgr::Open(const std::string &channel_name, const std::function<void(void *, int32_t)> func) {
|
||||
MS_LOG(INFO) << "Gpu queue: " << channel_name << " open.";
|
||||
if (name_queue_map_.find(channel_name) == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
|
||||
name_queue_map_[channel_name]->RegisterRelease(func);
|
||||
open_by_dataset_++;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
BlockQueueStatus_T DataQueueMgr::OpenDynamicBufQueue(const std::string &channel_name,
|
||||
const std::function<void(void *, int32_t)> func) {
|
||||
DataQueueStatus DataQueueMgr::OpenDynamicBufQueue(const std::string &channel_name,
|
||||
const std::function<void(void *, int32_t)> func) {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
if (name_queue_map_.find(channel_name) == name_queue_map_.end()) {
|
||||
BlockQueueStatus_T status = CreateDynamicBufQueue(channel_name, default_capacity_);
|
||||
MS_EXCEPTION_IF_CHECK_FAIL(status == BlockQueueStatus_T::SUCCESS, "Create dynamic buffer queue failed");
|
||||
DataQueueStatus status = CreateDynamicBufQueue(channel_name, default_capacity_);
|
||||
MS_EXCEPTION_IF_CHECK_FAIL(status == DataQueueStatus::SUCCESS, "Create dynamic buffer queue failed");
|
||||
MS_LOG_INFO << "Create dynamic buffer queue: " << channel_name;
|
||||
cv_.notify_all();
|
||||
}
|
||||
name_queue_map_[channel_name]->RegisterRelease(func);
|
||||
open_by_dataset_++;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::OpenDynamicBufQueue(const std::string &channel_name) {
|
||||
DataQueueStatus DataQueueMgr::OpenDynamicBufQueue(const std::string &channel_name) {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
auto time_out = cv_.wait_for(locker, std::chrono::seconds(MAX_WAIT_TIME_IN_SEC),
|
||||
[this, &channel_name] { return name_queue_map_.count(channel_name); });
|
||||
if (!time_out) {
|
||||
return TIMEOUT;
|
||||
return DataQueueStatus::TIMEOUT;
|
||||
}
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::CreateDynamicBufQueue(const std::string &channel_name, const size_t &capacity) {
|
||||
DataQueueStatus DataQueueMgr::CreateDynamicBufQueue(const std::string &channel_name, const size_t &capacity) {
|
||||
if (name_queue_map_.find(channel_name) != name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue already exist: " << channel_name;
|
||||
return QUEUE_EXIST;
|
||||
return DataQueueStatus::QUEUE_EXIST;
|
||||
}
|
||||
#if ENABLE_GPU
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
std::shared_ptr<DataQueue> device_queue = std::make_shared<GpuDataQueueDynamic>(capacity);
|
||||
#elif ENABLE_D
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
std::shared_ptr<DataQueue> device_queue = std::make_shared<AscendDataQueueDynamic>(capacity);
|
||||
#else
|
||||
MS_LOG(ERROR) << "Dynamic data queue only support Ascend/GPU target.";
|
||||
return QUEUE_EXIST;
|
||||
#endif
|
||||
#if ENABLE_GPU || ENABLE_D
|
||||
BlockQueueStatus_T rt = queue->Create(device_queue);
|
||||
if (rt != SUCCESS) {
|
||||
MS_LOG(ERROR) << "Queue: " << channel_name << "create failed: " << rt;
|
||||
return rt;
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
std::string device_name = MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
std::shared_ptr<DataQueue> device_queue = CreateDataQueue(device_name, channel_name, true, capacity, nullptr, {});
|
||||
if (device_queue != nullptr && (device_name == kGPUDevice || device_name == kAscendDevice)) {
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
DataQueueStatus rt = queue->Create(device_queue);
|
||||
if (rt != DataQueueStatus::SUCCESS) {
|
||||
MS_LOG(ERROR) << "Queue: " << channel_name << "create failed: " << rt;
|
||||
return rt;
|
||||
}
|
||||
(void)name_queue_map_.insert(std::make_pair(channel_name, queue));
|
||||
init_ = true;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
(void)name_queue_map_.insert(std::make_pair(channel_name, queue));
|
||||
init_ = true;
|
||||
return SUCCESS;
|
||||
#endif
|
||||
|
||||
MS_LOG(ERROR) << "Dynamic data queue only support Ascend/GPU target, bug got " << device_name;
|
||||
return DataQueueStatus::QUEUE_EXIST;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Open(const std::string &channel_name) const {
|
||||
DataQueueStatus DataQueueMgr::Open(const std::string &channel_name) const {
|
||||
if (name_queue_map_.find(channel_name) == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Push(const std::string &channel_name, const std::vector<DataQueueItem> &data,
|
||||
unsigned int timeout_in_sec) {
|
||||
DataQueueStatus DataQueueMgr::Push(const std::string &channel_name, const std::vector<DataQueueItem> &data,
|
||||
unsigned int timeout_in_sec) {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
return iter->second->Push(data, timeout_in_sec);
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Front(const std::string &channel_name, std::vector<DataQueueItem> *data) {
|
||||
DataQueueStatus DataQueueMgr::Front(const std::string &channel_name, std::vector<DataQueueItem> *data) {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
|
||||
return iter->second->Front(data);
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Pop(const std::string &channel_name) {
|
||||
DataQueueStatus DataQueueMgr::Pop(const std::string &channel_name) {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
|
||||
return iter->second->Pop();
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Clear(const std::string &channel_name) {
|
||||
DataQueueStatus DataQueueMgr::Clear(const std::string &channel_name) {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
|
||||
return iter->second->Clear();
|
||||
|
@@ -238,11 +251,39 @@ size_t DataQueueMgr::Capacity(const std::string &channel_name) {
|
|||
return name_queue_map_.at(channel_name)->Capacity();
|
||||
}
|
||||
|
||||
std::shared_ptr<DataQueue> DataQueueMgr::GetDataQueue(const std::string &channel_name) const {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return nullptr;
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(iter->second);
|
||||
return iter->second->Queue();
|
||||
}
|
||||
|
||||
DataQueueStatus DataQueueMgr::SetThreadDevice(const std::string &device_name) {
|
||||
auto tmp_queue = CreateDataQueue(device_name, {}, false, 0, nullptr, {});
|
||||
if (tmp_queue == nullptr) {
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
tmp_queue->SetThreadDevice();
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
std::shared_ptr<void> DataQueueMgr::AllocHostMem(const std::string &device_name, size_t size) {
|
||||
auto tmp_queue = CreateDataQueue(device_name, {}, false, 0, nullptr, {});
|
||||
if (tmp_queue == nullptr) {
|
||||
return std::shared_ptr<void>(::malloc(size), ::free);
|
||||
}
|
||||
|
||||
return tmp_queue->AllocHostMem(size);
|
||||
}
|
||||
#ifndef BUILD_LITE
|
||||
bool PopDataFromDataQueue(const AnfNodePtr &data_kernel) {
|
||||
auto queue_name = common::AnfAlgo::GetNodeAttr<std::string>(data_kernel, "shared_name");
|
||||
device::DataQueueMgr &buf_mgr = device::DataQueueMgr::GetInstance();
|
||||
auto ret = buf_mgr.OpenDynamicBufQueue(queue_name);
|
||||
MS_EXCEPTION_IF_CHECK_FAIL(ret == device::BlockQueueStatus_T::SUCCESS, "Open dynamic data queue failed");
|
||||
MS_EXCEPTION_IF_CHECK_FAIL(ret == device::DataQueueStatus::SUCCESS, "Open dynamic data queue failed");
|
||||
std::vector<device::DataQueueItem> data;
|
||||
auto kernel_info = dynamic_cast<device::KernelInfo *>(data_kernel->kernel_info());
|
||||
(void)buf_mgr.Front(queue_name, &data);
|
||||
|
@@ -271,29 +312,6 @@ bool PopDataFromDataQueue(const AnfNodePtr &data_kernel) {
|
|||
common::AnfAlgo::SetOutputInferTypeAndShape(types, shapes, data_kernel.get());
|
||||
return true;
|
||||
}
|
||||
namespace {
|
||||
struct DataQueueHandlerRegister {
|
||||
DataQueueHandlerRegister() noexcept {
|
||||
DataQueueHandler::SetOpenHandler(
|
||||
[](const std::string channel_name, const std::function<void(void *, int32_t)> func) -> BlockQueueStatus_T {
|
||||
return DataQueueMgr::GetInstance().Open(channel_name, func);
|
||||
});
|
||||
DataQueueHandler::SetOpenDynamicBufQueueHandler(
|
||||
[](const std::string &channel_name, const std::function<void(void *, int32_t)> func) -> BlockQueueStatus_T {
|
||||
return DataQueueMgr::GetInstance().OpenDynamicBufQueue(channel_name, func);
|
||||
});
|
||||
DataQueueHandler::SetIsClosedHandler([]() -> bool { return DataQueueMgr::GetInstance().IsClosed(); });
|
||||
DataQueueHandler::SetCloseConfirmHandler([]() { DataQueueMgr::GetInstance().CloseConfirm(); });
|
||||
DataQueueHandler::SetCloseHandler([](const std::string &channel) { DataQueueMgr::GetInstance().Close(channel); });
|
||||
DataQueueHandler::SetClearHandler([](const std::string &channel) -> device::BlockQueueStatus_T {
|
||||
return DataQueueMgr::GetInstance().Clear(channel);
|
||||
});
|
||||
DataQueueHandler::SetPushHandler([](const std::string &channel, const std::vector<device::DataQueueItem> &items,
|
||||
unsigned int timeout) -> device::BlockQueueStatus_T {
|
||||
return DataQueueMgr::GetInstance().Push(channel, items, timeout);
|
||||
});
|
||||
}
|
||||
} callback_register;
|
||||
} // namespace
|
||||
#endif
|
||||
} // namespace device
|
||||
} // namespace mindspore
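The DataQueueHandler block above follows a late-binding idiom: the dataset side only sees std::function slots, and this file fills them in from a static initializer so neither side has to link the other directly. A generic, fully hypothetical sketch of that idiom (the names below are not from MindSpore):

class ExampleHandler {
 public:
  using OpenFn = std::function<DataQueueStatus(const std::string &)>;
  static void SetOpenHandler(OpenFn fn) { open_fn_ = std::move(fn); }
  static DataQueueStatus Open(const std::string &channel) {
    return open_fn_ ? open_fn_(channel) : DataQueueStatus::QUEUE_NOT_EXIST;
  }

 private:
  static inline OpenFn open_fn_;  // set once at static-init time by the backend
};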
|
||||
|
|
|
@@ -1,131 +0,0 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_DATA_QUEUE_MGR_H_
|
||||
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_DATA_QUEUE_MGR_H_
|
||||
|
||||
#include <unistd.h>
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include "runtime/data_queue/blocking_queue.h"
|
||||
|
||||
#define EXPORT __attribute__((visibility("default")))
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
static const unsigned int MAX_WAIT_TIME_IN_SEC = 60;
|
||||
|
||||
class Semaphore {
|
||||
public:
|
||||
explicit Semaphore(int count = 0) : count_(count) {}
|
||||
|
||||
inline void Signal() {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
++count_;
|
||||
cv_.notify_one();
|
||||
}
|
||||
|
||||
inline bool Wait() {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
while (count_ == 0) {
|
||||
if (cv_.wait_for(lock, std::chrono::seconds(MAX_WAIT_TIME_IN_SEC)) == std::cv_status::timeout) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
--count_;
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex mutex_;
|
||||
std::condition_variable cv_;
|
||||
int count_;
|
||||
};
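A short sketch of how this Semaphore is meant to be used between a dataset producer thread and a consumer that must not hang forever: the 60 s timeout in Wait() turns a stalled producer into a recoverable false return instead of a deadlock (the thread bodies below are illustrative only, not part of the original file):

// Assumes the Semaphore class above is visible in this translation unit.
#include <thread>

void ProducerConsumerSketch() {
  mindspore::device::Semaphore sem;
  std::thread producer([&sem]() {
    // ... stage one batch ...
    sem.Signal();  // wake exactly one waiter
  });
  bool got_batch = sem.Wait();  // false if nothing was signalled within MAX_WAIT_TIME_IN_SEC
  producer.join();
  (void)got_batch;
}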
class DataQueueMgr {
 public:
  EXPORT DataQueueMgr() : cur_dev_id_(0), init_(false), closed_(false), open_by_dataset_(0) {}

  EXPORT virtual ~DataQueueMgr() = default;

  EXPORT static DataQueueMgr &GetInstance() noexcept;

  EXPORT BlockQueueStatus_T Create(const std::string &channel_name, void *addr, const std::vector<size_t> &shape,
                                   const size_t &capacity);

  // call for Push thread
  EXPORT BlockQueueStatus_T Open(const std::string &channel_name, std::function<void(void *, int32_t)> func);

  // call for Front/Pop thread
  EXPORT BlockQueueStatus_T Open(const std::string &channel_name) const;
  EXPORT BlockQueueStatus_T Push(const std::string &channel_name, const std::vector<DataQueueItem> &data,
                                 unsigned int timeout_in_sec);
  EXPORT BlockQueueStatus_T Front(const std::string &channel_name, std::vector<DataQueueItem> *data);
  EXPORT BlockQueueStatus_T Pop(const std::string &channel_name);
  EXPORT BlockQueueStatus_T Clear(const std::string &channel_name);

  EXPORT BlockQueueStatus_T OpenDynamicBufQueue(const std::string &channel_name,
                                                const std::function<void(void *, int32_t)> func);
  EXPORT BlockQueueStatus_T CreateDynamicBufQueue(const std::string &channel_name, const size_t &capacity);
  EXPORT BlockQueueStatus_T OpenDynamicBufQueue(const std::string &channel_name);

  EXPORT void Close(const std::string &channel_name) const noexcept;

  EXPORT bool IsInit() const;

  EXPORT bool IsClosed() const;

  EXPORT bool Destroy();

  // call for Release GPU Resources
  EXPORT bool CloseNotify();

  // call for dataset send thread
  EXPORT void CloseConfirm();

  EXPORT size_t Size(const std::string &channel_name);

  EXPORT size_t Capacity(const std::string &channel_name);

 private:
  int cur_dev_id_;
  bool init_;
  bool closed_;
  std::mutex mutex_;
  std::mutex close_mutex_;
  std::condition_variable cv_;
  // how many queues opened by dataset
  int open_by_dataset_;
  Semaphore sema;
  bool dynamic_shape_{false};
  size_t default_capacity_{2};

  std::map<std::string, std::shared_ptr<BlockingQueue>> name_queue_map_;

  inline bool isCreated(const std::string &channel_name) const;

  DataQueueMgr(const DataQueueMgr &) = delete;
  DataQueueMgr &operator=(const DataQueueMgr &) = delete;
};
bool PopDataFromDataQueue(const AnfNodePtr &data_kernel);
}  // namespace device
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_BUFFER_MGR_H_
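For orientation, a hedged sketch of how this now-removed interface was typically driven end to end; the channel name, shape, capacity, and timeout are placeholder values, and error handling is elided:

// Sketch against the pre-refactor declarations above (BlockQueueStatus_T era), not this commit's new API.
#include <vector>

void OldQueueLifecycleSketch(void *device_addr) {
  using mindspore::device::DataQueueItem;
  using mindspore::device::DataQueueMgr;
  auto &mgr = DataQueueMgr::GetInstance();

  // Create a channel backed by device memory, then open it for the push (dataset) side.
  (void)mgr.Create("example_channel", device_addr, {32, 224, 224, 3}, 4);
  (void)mgr.Open("example_channel", [](void *, int32_t) { /* release callback */ });

  std::vector<DataQueueItem> batch;  // filled by the dataset pipeline
  (void)mgr.Push("example_channel", batch, /*timeout_in_sec=*/30);

  std::vector<DataQueueItem> front;
  (void)mgr.Front("example_channel", &front);  // peek for the consuming kernel
  (void)mgr.Pop("example_channel");            // then drop it
  mgr.Close("example_channel");
}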
@ -11,10 +11,5 @@ if("${ENABLE_HIDDEN}" STREQUAL "OFF")
    string(REPLACE " -fvisibility=hidden" " -fvisibility=default" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

if(ENABLE_TDTQUE)
    file(GLOB_RECURSE TDT_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "../../minddata/dataset/engine/tdt/tdt_handle.cc")
endif()

set_property(SOURCE ${DEVICE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
add_library(_mindspore_runtime_device_obj OBJECT ${DEVICE_SRC_LIST} ${TDT_SRC_LIST})
add_library(_mindspore_runtime_device_obj OBJECT ${DEVICE_SRC_LIST})
@ -1,20 +0,0 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/data_queue_handler.h"
namespace {
// empty source file trick to avoid symbol ex/import problem on Windows
}  // namespace
@ -1,50 +0,0 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_INCLUDE_COMMON_UTILS_DATA_QUEUE_HANDLER_H_
#define MINDSPORE_CCSRC_INCLUDE_COMMON_UTILS_DATA_QUEUE_HANDLER_H_

#include <string>
#include <utility>
#include <functional>
#include <vector>
#include "utils/macros.h"
#include "utils/callback_handler.h"

namespace mindspore {
namespace device {
enum BlockQueueStatus_T : int { SUCCESS = 0, QUEUE_EXIST, QUEUE_NOT_EXIST, ERROR_INPUT, INTERNAL_ERROR, TIMEOUT };
struct DataQueueItem {
  int32_t worker_id_{0};
  std::string data_type_;
  size_t data_len_{0};
  void *data_ptr_{nullptr};
  std::vector<int64_t> shapes_;
  void *device_addr_{nullptr};
};
}  // namespace device
class MS_EXPORT DataQueueHandler {
  HANDLER_DEFINE(device::BlockQueueStatus_T, OpenDynamicBufQueue, const std::string &,
                 const std::function<void(void *, int32_t)>);
  HANDLER_DEFINE(device::BlockQueueStatus_T, Open, const std::string &, const std::function<void(void *, int32_t)>);
  HANDLER_DEFINE(bool, IsClosed);
  HANDLER_DEFINE(void, CloseConfirm);
  HANDLER_DEFINE(void, Close, const std::string &);
  HANDLER_DEFINE(device::BlockQueueStatus_T, Clear, const std::string &);
  HANDLER_DEFINE(device::BlockQueueStatus_T, Push, const std::string &, const std::vector<device::DataQueueItem> &,
                 unsigned int);
};
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_INCLUDE_COMMON_UTILS_DATA_QUEUE_HANDLER_H_
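The DataQueueItem struct in the removed header above carries one tensor of a batch across the host/device boundary. A minimal sketch of populating one (field values are illustrative; the helper name is not a MindSpore API):

#include <vector>

mindspore::device::DataQueueItem MakeItemSketch(void *host_ptr, size_t nbytes) {
  mindspore::device::DataQueueItem item;
  item.worker_id_ = 0;
  item.data_type_ = "float32";       // type name carried as a string, per the struct above
  item.data_len_ = nbytes;           // payload size in bytes
  item.data_ptr_ = host_ptr;         // host-side source buffer
  item.shapes_ = {32, 224, 224, 3};  // tensor shape of this item
  item.device_addr_ = nullptr;       // bound later, once device memory is assigned
  return item;
}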
@ -39,7 +39,6 @@ constexpr auto kSplitLine = "\n-------------------------------------------------
#if defined(__ANDROID__) || defined(ANDROID)
constexpr const char *ANDROID_LOG_TAG = "MS_LITE";
#endif
std::map<void **, std::thread *> acl_handle_map;
// set default log level to WARNING for all sub modules
int g_ms_submodule_log_levels[NUM_SUBMODUES] = {WARNING};
#if defined(_WIN32) || defined(_WIN64)
@ -47,7 +47,6 @@ static constexpr size_t GetRelPathPos() noexcept {
}
namespace mindspore {
/// \brief The handler map for ACL.
MS_CORE_API extern std::map<void **, std::thread *> acl_handle_map;
#define FILE_NAME \
  (sizeof(__FILE__) > GetRelPathPos() ? static_cast<const char *>(__FILE__) + GetRelPathPos() \
                                      : static_cast<const char *>(__FILE__))
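The FILE_NAME macro above trims the build-tree prefix from __FILE__ at compile time so log lines show repository-relative paths. A standalone sketch of the same trick, with the prefix length stood in by a constant (GetRelPathPos() itself is not reproduced here; the value 22 is an arbitrary assumption):

#include <cstddef>
#include <cstdio>

// Illustrative only: pretend the absolute prefix before the repo-relative path is 22 characters.
static constexpr std::size_t kRelPathPos = 22;
#define SHORT_FILE_SKETCH \
  (sizeof(__FILE__) > kRelPathPos ? static_cast<const char *>(__FILE__) + kRelPathPos : static_cast<const char *>(__FILE__))

int main() {
  std::printf("%s\n", SHORT_FILE_SKETCH);  // drops the first 22 characters when the path is long enough
  return 0;
}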
@ -53,7 +53,7 @@ TEST_F(TestCompileSegmentRunner, test_MsVmConvert1) {
  std::shared_ptr<mindspore::FuncGraphManager> manager = mindspore::Manage(g);

  BackendPtr b = std::make_shared<Backend>("vm");
  auto graph_partition = std::make_shared<GraphPartition>(nonlinear_ops, b->name());
  auto graph_partition = std::make_shared<GraphPartition>(GetNonlinearOps(), b->name());
  auto segments = graph_partition->Partition(g);
  VectorRef args({1.0, 2.0});

@ -67,7 +67,7 @@ TEST_F(TestCompileSegmentRunner, test_MsVmConvert2) {
  std::shared_ptr<mindspore::FuncGraphManager> manager = mindspore::Manage(g);

  BackendPtr b = std::make_shared<Backend>("vm");
  auto graph_partition = std::make_shared<GraphPartition>(nonlinear_ops, b->name());
  auto graph_partition = std::make_shared<GraphPartition>(GetNonlinearOps(), b->name());
  auto segments = graph_partition->Partition(g);
  VectorRef args({1.0, 2.0});

@ -81,7 +81,7 @@ TEST_F(TestCompileSegmentRunner, test_if) {
  std::shared_ptr<mindspore::FuncGraphManager> manager = mindspore::Manage(g);

  BackendPtr b = std::make_shared<Backend>("vm");
  auto graph_partition = std::make_shared<GraphPartition>(nonlinear_ops, b->name());
  auto graph_partition = std::make_shared<GraphPartition>(GetNonlinearOps(), b->name());
  auto segments = graph_partition->Partition(g);
  VectorRef args({1.0, 2.0});