forked from mindspore-Ecosystem/mindspore
refactor data queue
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
This commit is contained in:
parent
4da277798e
commit
e0c7fa6136
@ -81,7 +81,6 @@ if(ENABLE_D)
endif()

if(ENABLE_GPU)
set(ENABLE_GPUQUE ON)
add_compile_definitions(ENABLE_GPU_COLLECTIVE)
endif()
@ -545,7 +545,7 @@ MindRTBackend::MindRTBackend(const std::string &backend_name, const std::string
root_graph_ = nullptr;
auto ms_context = MsContext::GetInstance();
const bool pynative_mode = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode);
auto &cut_list = pynative_mode ? compile::control_ops : GetMsNonlinearOps();
auto &cut_list = pynative_mode ? GetControlOps() : GetMsNonlinearOps();

graph_partition_ = std::make_shared<GraphPartition>(cut_list, backend_name);
graph_compiler_ = std::make_shared<GraphCompiler>();
@ -40,11 +40,17 @@ using PrimTypePair = std::pair<PrimitivePtr, AbstractFunctionPtr>;
using MapPrimTypeFuncGraph = std::map<PrimTypePair, FuncGraphPtr>;
using TypedPrimitiveAbstractClosurePtr = std::shared_ptr<abstract::TypedPrimitiveAbstractClosure>;

std::vector<PrimitivePtr> nonlinear_ops = {prim::kPrimReturn, prim::kPrimPartial, prim::kPrimSwitch,
prim::kPrimMakeTuple, prim::kPrimBpropCut};
const std::vector<PrimitivePtr> &GetNonlinearOps() {
static std::vector<PrimitivePtr> nonlinear_ops = {prim::kPrimReturn, prim::kPrimPartial, prim::kPrimSwitch,
prim::kPrimMakeTuple, prim::kPrimBpropCut};
return nonlinear_ops;
}

std::vector<PrimitivePtr> control_ops = {prim::kPrimReturn, prim::kPrimPartial, prim::kPrimSwitch, prim::kPrimMakeTuple,
prim::kPrimSwitchLayer};
const std::vector<PrimitivePtr> &GetControlOps() {
static std::vector<PrimitivePtr> control_ops = {prim::kPrimReturn, prim::kPrimPartial, prim::kPrimSwitch,
prim::kPrimMakeTuple, prim::kPrimSwitchLayer};
return control_ops;
}

const std::vector<PrimitivePtr> &GetMsNonlinearOps() {
static const std::vector<PrimitivePtr> ms_nonlinear_ops = {prim::kPrimReturn, prim::kPrimPartial,
@ -43,8 +43,8 @@ extern const char kGeVm[];
// compile namespace
// A sub namespace in ME to support compile related definition.
namespace compile {
BACKEND_EXPORT extern std::vector<PrimitivePtr> nonlinear_ops;
BACKEND_EXPORT extern std::vector<PrimitivePtr> control_ops;
BACKEND_EXPORT const std::vector<PrimitivePtr> &GetNonlinearOps();
BACKEND_EXPORT const std::vector<PrimitivePtr> &GetControlOps();
BACKEND_EXPORT const std::vector<PrimitivePtr> &GetMsNonlinearOps();
FuncGraphPtr WrapPrimitives(const FuncGraphPtr &graph);
using VmEvalFunc = std::function<BaseRef(const VectorRef &)>;

@ -52,7 +52,7 @@ using VmEvalFuncPtr = std::shared_ptr<std::function<BaseRef(const VectorRef &)>>

class BACKEND_EXPORT CompileGraph {
public:
explicit CompileGraph(const BackendPtr &backend, const std::vector<PrimitivePtr> &cut_list = nonlinear_ops);
explicit CompileGraph(const BackendPtr &backend, const std::vector<PrimitivePtr> &cut_list = GetNonlinearOps());

virtual ~CompileGraph() = default;

@ -113,7 +113,7 @@ using CompileGraphPtr = std::shared_ptr<CompileGraph>;
// CompileGraphs is used to Convert a graph cluster into instruction lists.
class BACKEND_EXPORT CompileGraphs {
public:
explicit CompileGraphs(const BackendPtr &backend, const std::vector<PrimitivePtr> &cut_list = nonlinear_ops);
explicit CompileGraphs(const BackendPtr &backend, const std::vector<PrimitivePtr> &cut_list = GetNonlinearOps());

virtual ~CompileGraphs() = default;
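The hunks above replace the exported global vectors (nonlinear_ops, control_ops) with accessor functions that return a function-local static, presumably to avoid static-initialization-order problems across the BACKEND_EXPORT library boundary. A minimal self-contained sketch of that pattern, with hypothetical names rather than MindSpore code:

#include <iostream>
#include <string>
#include <vector>

// A function-local static is constructed on first call, so callers in other
// translation units (or across a shared-library boundary) never observe a
// not-yet-initialized global object.
const std::vector<std::string> &GetDefaultOps() {
  static const std::vector<std::string> ops = {"Return", "Partial", "Switch"};
  return ops;
}

int main() {
  std::cout << GetDefaultOps().size() << std::endl;  // prints 3
  return 0;
}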
@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

@ -14,8 +14,8 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_BLOCKING_QUEUE_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_BLOCKING_QUEUE_H_
#ifndef MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_BLOCKING_QUEUE_H
#define MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_BLOCKING_QUEUE_H

#include <unistd.h>
#include <iostream>

@ -25,7 +25,7 @@
#include <vector>
#include <condition_variable>
#include <functional>
#include "runtime/data_queue/data_queue.h"
#include "include/backend/data_queue/data_queue.h"
namespace mindspore {
namespace device {
class BlockingQueue {

@ -33,15 +33,16 @@ class BlockingQueue {
BlockingQueue() : queue_(nullptr) {}
~BlockingQueue() = default;

BlockQueueStatus_T Create(const std::shared_ptr<DataQueue> &data_queue);
DataQueueStatus Create(const std::shared_ptr<DataQueue> &data_queue);
void RegisterRelease(const std::function<void(void *, int32_t)> &func);
BlockQueueStatus_T Push(const std::vector<DataQueueItem> &data, unsigned int timeout_in_sec);
BlockQueueStatus_T Front(std::vector<DataQueueItem> *data);
BlockQueueStatus_T Pop();
BlockQueueStatus_T Clear();
DataQueueStatus Push(const std::vector<DataQueueItem> &data, unsigned int timeout_in_sec);
DataQueueStatus Front(std::vector<DataQueueItem> *data);
DataQueueStatus Pop();
DataQueueStatus Clear();
bool Destroy();
size_t Size() { return queue_->Size(); }
size_t Capacity() { return queue_->Capacity(); }
const std::shared_ptr<DataQueue> &Queue() const { return queue_; }

private:
std::mutex mutex_;

@ -51,5 +52,4 @@ class BlockingQueue {
};
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_BLOCKING_QUEUE_H_
#endif // MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_BLOCKING_QUEUE_H
@ -14,19 +14,30 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_DATA_QUEUE_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_DATA_QUEUE_H_
#ifndef MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_H
#define MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_H

#include <unistd.h>
#include <string>
#include <memory>
#include <vector>
#include <functional>
#include "runtime/hardware/device_context_manager.h"
#include "mindspore/core/utils/data_queue_handler.h"

namespace mindspore {
namespace device {
class DeviceContext;

enum class DataQueueStatus : int { SUCCESS = 0, QUEUE_EXIST, QUEUE_NOT_EXIST, ERROR_INPUT, INTERNAL_ERROR, TIMEOUT };

struct DataQueueItem {
int32_t worker_id_{0};
std::string data_type_;
size_t data_len_{0};
void *data_ptr_{nullptr};
std::vector<int64_t> shapes_;
void *device_addr_{nullptr};
};

class DataQueue {
public:
explicit DataQueue(const size_t capacity);

@ -34,16 +45,21 @@ class DataQueue {

virtual void RegisterRelease(const std::function<void(void *, int32_t)> &func) { host_release_ = func; }

virtual bool IsOpen() const { return true; }
virtual bool IsEmpty() const { return size_ == 0; }
virtual bool IsFull() const { return size_ == capacity_; }

virtual BlockQueueStatus_T Push(std::vector<DataQueueItem> data) = 0;
virtual BlockQueueStatus_T Front(std::vector<DataQueueItem> *data) const = 0;
virtual BlockQueueStatus_T Pop() = 0;
virtual DataQueueStatus Push(std::vector<DataQueueItem> data) = 0;
virtual DataQueueStatus Front(std::vector<DataQueueItem> *data) const = 0;
virtual DataQueueStatus Pop() = 0;
virtual bool Destroy() = 0;
virtual void SetThreadDevice() = 0;

virtual size_t Size() { return size_; }
virtual size_t Capacity() { return capacity_; }

virtual std::shared_ptr<void> AllocHostMem(size_t size) { return std::shared_ptr<void>(::malloc(size), ::free); }

protected:
size_t head_;
size_t tail_;

@ -59,5 +75,4 @@ class DataQueue {
};
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_BLOCKING_QUEUE_H_
#endif // MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_H
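As a reading aid only: the head_/tail_/size_/capacity_ members and the DataQueueStatus results above suggest a fixed-capacity ring buffer underneath the Push/Front/Pop contract. A self-contained toy sketch of that shape (ints instead of DataQueueItem, no blocking, purely illustrative and not the device-specific implementation in this commit):

#include <cstddef>
#include <vector>

enum class DataQueueStatus : int { SUCCESS = 0, QUEUE_EXIST, QUEUE_NOT_EXIST, ERROR_INPUT, INTERNAL_ERROR, TIMEOUT };

class RingQueue {
 public:
  explicit RingQueue(size_t capacity) : head_(0), tail_(0), size_(0), capacity_(capacity), buf_(capacity) {}

  DataQueueStatus Push(int item) {
    if (size_ == capacity_) {
      return DataQueueStatus::TIMEOUT;  // a real queue would block until a slot frees up
    }
    buf_[tail_] = item;
    tail_ = (tail_ + 1) % capacity_;
    ++size_;
    return DataQueueStatus::SUCCESS;
  }

  DataQueueStatus Front(int *item) const {
    if (size_ == 0) {
      return DataQueueStatus::INTERNAL_ERROR;
    }
    *item = buf_[head_];
    return DataQueueStatus::SUCCESS;
  }

  DataQueueStatus Pop() {
    if (size_ == 0) {
      return DataQueueStatus::INTERNAL_ERROR;
    }
    head_ = (head_ + 1) % capacity_;
    --size_;
    return DataQueueStatus::SUCCESS;
  }

 private:
  size_t head_, tail_, size_, capacity_;
  std::vector<int> buf_;
};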
@ -0,0 +1,156 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_MGR_H
#define MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_MGR_H

#include <unistd.h>
#include <iostream>
#include <functional>
#include <map>
#include <vector>
#include <string>
#include <memory>
#include <mutex>
#include <condition_variable>
#include "include/backend/visible.h"
#include "include/backend/data_queue/data_queue.h"
#ifndef BUILD_LITE
#include "ir/anf.h"
#endif

namespace mindspore {
namespace device {
constexpr unsigned int MAX_WAIT_TIME_IN_SEC = 60;

class BlockingQueue;

// channel_name, dynamic_shape, capacity, addr, shape
using DataQueueCreator =
std::function<std::shared_ptr<DataQueue>(const std::string &, bool, size_t, void *, const std::vector<size_t> &)>;

class Semaphore {
public:
explicit Semaphore(int count = 0) : count_(count) {}

inline void Signal() {
std::unique_lock<std::mutex> lock(mutex_);
++count_;
cv_.notify_one();
}

inline bool Wait() {
std::unique_lock<std::mutex> lock(mutex_);
while (count_ == 0) {
if (cv_.wait_for(lock, std::chrono::seconds(MAX_WAIT_TIME_IN_SEC)) == std::cv_status::timeout) {
return false;
}
}
--count_;
return true;
}

private:
std::mutex mutex_;
std::condition_variable cv_;
int count_;
};

class BACKEND_EXPORT DataQueueMgr {
public:
DataQueueMgr() : init_(false), closed_(false), open_by_dataset_(0) {}

virtual ~DataQueueMgr() = default;

static DataQueueMgr &GetInstance() noexcept;
void RegisterDataQueueCreator(const std::string &device_name, DataQueueCreator &&creator);
std::shared_ptr<DataQueue> CreateDataQueue(const std::string &device_name, const std::string &channel_name,
bool dynamic_shape, size_t capacity, void *addr,
const std::vector<size_t> &shape);

DataQueueStatus Create(const std::string &channel_name, void *addr, const std::vector<size_t> &shape,
const size_t &capacity);

// call for Push thread
DataQueueStatus Open(const std::string &channel_name, std::function<void(void *, int32_t)> func);

// call for Front/Pop thread
DataQueueStatus Open(const std::string &channel_name) const;
DataQueueStatus Push(const std::string &channel_name, const std::vector<DataQueueItem> &data,
unsigned int timeout_in_sec);
DataQueueStatus Front(const std::string &channel_name, std::vector<DataQueueItem> *data);
DataQueueStatus Pop(const std::string &channel_name);
DataQueueStatus Clear(const std::string &channel_name);

DataQueueStatus OpenDynamicBufQueue(const std::string &channel_name, const std::function<void(void *, int32_t)> func);
DataQueueStatus CreateDynamicBufQueue(const std::string &channel_name, const size_t &capacity);
DataQueueStatus OpenDynamicBufQueue(const std::string &channel_name);
std::shared_ptr<DataQueue> GetDataQueue(const std::string &channel_name) const;
DataQueueStatus SetThreadDevice(const std::string &device_name);
std::shared_ptr<void> AllocHostMem(const std::string &device_name, size_t size);

void Close(const std::string &channel_name) const noexcept;

bool IsInit() const;

bool IsClosed() const;

bool Destroy();

// call for Release GPU Resources
bool CloseNotify();

// call for dataset send thread
void CloseConfirm();

size_t Size(const std::string &channel_name);

size_t Capacity(const std::string &channel_name);

private:
inline bool isCreated(const std::string &channel_name) const;

DataQueueMgr(const DataQueueMgr &) = delete;
DataQueueMgr &operator=(const DataQueueMgr &) = delete;

bool init_;
bool closed_;
std::mutex mutex_;
std::mutex close_mutex_;
std::condition_variable cv_;
// how many queues opened by dataset
int open_by_dataset_;
Semaphore sema;
bool dynamic_shape_{false};
size_t default_capacity_{2};

std::map<std::string, std::shared_ptr<BlockingQueue>> name_queue_map_;

std::map<std::string, DataQueueCreator> data_queue_creator_map_ = {}; // key: device name, value: DataQueueCreator
};
#ifndef BUILD_LITE
bool PopDataFromDataQueue(const AnfNodePtr &data_kernel);
#endif
#define REGISTER_DATA_QUEUE_CREATOR(device_name, creator) \
struct device_name##DataQueueCreatorClass { \
device_name##DataQueueCreatorClass() { \
DataQueueMgr::GetInstance().RegisterDataQueueCreator(device_name, creator); \
} \
} g_##device_name##_data_queue_creator;
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_CCSRC_INCLUDE_BACKEND_DATA_QUEUE_DATA_QUEUE_MGR_H
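For orientation, the producer side of this manager is driven the way the dataset changes later in this commit drive it: open a channel with a release callback, push DataQueueItem batches, then close and confirm on shutdown. A condensed sketch against the declarations above (the channel name is a placeholder, and this assumes the MindSpore build tree for the include):

#include <vector>
#include "include/backend/data_queue/data_queue_mgr.h"

void SendBatch(const std::vector<mindspore::device::DataQueueItem> &items) {
  auto &mgr = mindspore::device::DataQueueMgr::GetInstance();
  // The release callback lets the producer reclaim host buffers once the
  // device copy is done (the GPU path below uses it to return pinned memory).
  auto release = [](void *, int32_t) {};
  if (mgr.Open("example_channel", release) != mindspore::device::DataQueueStatus::SUCCESS) {
    return;  // channel was never created for this device
  }
  (void)mgr.Push("example_channel", items, /*timeout_in_sec=*/60);
  mgr.Close("example_channel");
  mgr.CloseConfirm();
}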
@ -24,10 +24,6 @@ if(ENABLE_ACL)
add_definitions(-D ENABLE_ACL)
message(STATUS "ACL module is enabled")
endif()
if(ENABLE_GPUQUE)
add_definitions(-D ENABLE_GPUQUE)
message(STATUS "GPU queue is enabled")
endif()
if(ENABLE_TDTQUE)
add_definitions(-D ENABLE_TDTQUE)
message(STATUS "TDT queue is enabled")

@ -119,9 +115,6 @@ endif()
if(NOT ENABLE_SECURITY)
add_dependencies(engine-perf core)
endif()
if(ENABLE_TDTQUE)
add_dependencies(engine-device-queue core)
endif()

# for proto/cache_grpc.pb.h dependency
if(ENABLE_CACHE)

@ -204,11 +197,6 @@ if(ENABLE_TDTQUE)
if(NOT ENABLE_SECURITY)
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-perf>)
endif()
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-device-queue>)
# tdt impl
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-tdt>)
# host queue impl
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-host-queue>)
else()
if(NOT ENABLE_SECURITY)
set(dataengine_submodules ${dataengine_submodules} $<TARGET_OBJECTS:engine-perf>)

@ -251,16 +239,7 @@ target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::turb
mindspore::opencv_imgcodecs mindspore::opencv_imgproc mindspore::tinyxml2
mindspore::sentencepiece_train ${ICU_LIB})

if(ENABLE_GPUQUE)
target_link_libraries(_c_dataengine PRIVATE
${CUDNN_LIBRARY_PATH}
${CUDA_PATH}/lib64/libcudart.so
${CUDA_PATH}/lib64/stubs/libcuda.so)
endif()

if(ENABLE_TDTQUE)
target_link_libraries(_c_dataengine PRIVATE ${ACL} ${ACL_TDT_CHANNEL})
endif()
target_link_libraries(_c_dataengine PRIVATE mindspore_backend)

add_dependencies(_c_dataengine _c_mindrecord)
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
@ -10,10 +10,6 @@ endif()

add_subdirectory(cache)

if(ENABLE_TDTQUE)
add_subdirectory(device_queue_impl)
endif()

file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
set(SRC_FILES_LIST

@ -67,7 +63,3 @@ if(NOT ENABLE_SECURITY)
add_dependencies(engine-perf engine-cache-client)
endif()
endif()

if(ENABLE_TDTQUE)
add_dependencies(engine engine-device-queue)
endif()
@ -35,7 +35,9 @@
#include "minddata/mindrecord/include/shard_header.h"
#include "minddata/mindrecord/include/shard_writer.h"
#endif

#ifdef WITH_BACKEND
#include "utils/ms_context.h"
#endif
namespace mindspore {
namespace dataset {
#ifndef ENABLE_SECURITY

@ -355,18 +357,19 @@ Status TreeConsumer::Reset(int64_t step) {
#endif
RETURN_IF_NOT_OK(this->Terminate());
}

#ifdef ENABLE_GPUQUE
// clear the device if GPU is used.
std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
CHECK_FAIL_RETURN_UNEXPECTED(root != nullptr, "Root is a nullptr.");
DeviceQueueOp *op = dynamic_cast<DeviceQueueOp *>(root.get());
if (op != nullptr) {
MS_LOG(INFO) << "Clearing the GPU device";
RETURN_IF_NOT_OK(op->ClearDevice());
#ifdef WITH_BACKEND
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
if (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
// clear the device if GPU is used.
std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
CHECK_FAIL_RETURN_UNEXPECTED(root != nullptr, "Root is a nullptr.");
DeviceQueueOp *op = dynamic_cast<DeviceQueueOp *>(root.get());
if (op != nullptr) {
MS_LOG(INFO) << "Clearing the GPU device";
RETURN_IF_NOT_OK(op->ClearDevice());
}
}
#endif

tree_adapter_ = std::make_unique<TreeAdapter>(TreeAdapter::UsageFlag::kDeReset);
RETURN_IF_NOT_OK(tree_adapter_->Compile(old_root, num_epochs_, step));
RETURN_IF_NOT_OK(tree_adapter_->Launch());
@ -21,12 +21,33 @@
#include <memory>
#include <unordered_map>

#include "minddata/dataset/engine/gpu_item_connector.h"
#include "minddata/dataset/engine/dataset_iterator.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/util/task_manager.h"

#ifdef WITH_BACKEND
#include "mindspore/ccsrc/include/backend/data_queue/data_queue_mgr.h"
#endif
#include "mindspore/ccsrc/ps/ps_cache/ps_data/ps_data_prefetch.h"
#ifdef WITH_BACKEND
#include "utils/ms_context.h"
#endif
namespace mindspore {
namespace dataset {
namespace {
std::vector<DataQueueItem> ConvertTensorRowToDataQueueItem(const TensorRow &row) {
std::vector<device::DataQueueItem> items;
for (auto &i : row) {
device::DataQueueItem data_item;
data_item.data_len_ = static_cast<size_t>(i->SizeInBytes());
data_item.shapes_ = i->shape().AsVector();
data_item.data_ptr_ = const_cast<void *>(static_cast<const void *>(i->GetBuffer()));
data_item.data_type_ = i->type().ToString();
items.emplace_back(std::move(data_item));
}
return items;
}
} // namespace
DeviceQueueOp::DeviceQueueOp(const std::string channel_name, DeviceType device_type, int32_t device_id,
bool send_epoch_end, int32_t total_batch, bool create_data_info_queue)
: PipelineOp(1),
|
|||
create_data_info_queue_(create_data_info_queue),
|
||||
data_info_queue_ptr_(nullptr),
|
||||
first_fetch_flag_(false),
|
||||
first_push_flag_(false)
|
||||
#ifdef ENABLE_TDTQUE
|
||||
,
|
||||
ascend_keep_waiting_(true),
|
||||
tdtInstancePtr(std::make_shared<TdtPlugin>(channel_name_, device_id_))
|
||||
#endif
|
||||
{
|
||||
first_push_flag_(false),
|
||||
ascend_keep_waiting_(true) {
|
||||
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
|
||||
dynamic_shape_ = cfg->dynamic_shape();
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
// Get the total device num of current machine
|
||||
int32_t device_count = 0;
|
||||
cudaGetDeviceCount(&device_count);
|
||||
rank_id_ = cfg->rank_id(); // Get the current rank_id
|
||||
if (device_count > 0) {
|
||||
rank_id_ = rank_id_ % device_count;
|
||||
}
|
||||
// Be careful when try to modified these num_workers_ and queue_capacity_,
|
||||
// and we suggest num_workers_ * queue_capacity_ not greater than 16, because
|
||||
// one worker one circular_pool with 1G pin memory, so num_workers_ * queue_capacity_
|
||||
// must limit to avoid memory overload
|
||||
num_workers_ = kDeviceQueGpuNumThreads;
|
||||
queue_capacity_ = kDeviceQueGpuQueueCapacity;
|
||||
#endif
|
||||
#ifdef ENABLE_TDTQUE
|
||||
ascend_keep_waiting_ = true;
|
||||
if (kIsHeterogeneous) {
|
||||
tdtInstancePtr = std::make_shared<HostQueueImpl>(channel_name_, device_id_);
|
||||
} else {
|
||||
tdtInstancePtr = std::make_shared<TdtPlugin>(channel_name_, device_id_);
|
||||
#ifdef WITH_BACKEND
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
if (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice && !dynamic_shape_) {
|
||||
ascend_data_queue_ =
|
||||
device::DataQueueMgr::GetInstance().CreateDataQueue(kAscendDevice, channel_name, dynamic_shape_, 0, nullptr, {});
|
||||
}
|
||||
#endif
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
|
@ -87,13 +94,11 @@ DeviceQueueOp::~DeviceQueueOp() {
|
|||
#endif
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
void DeviceQueueOp::ReleaseData(void *addr, int32_t worker_id) {
|
||||
if (addr != nullptr && worker_id >= 0 && worker_id < pool_.size()) {
|
||||
pool_[worker_id]->Deallocate(addr);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
Status DeviceQueueOp::EoeReceived(int32_t worker_id) {
|
||||
state_ = OpState::kDeOpIdle;
|
||||
|
@ -150,7 +155,7 @@ Status DeviceQueueOp::operator()() {
|
|||
}
|
||||
#endif
|
||||
if (device_type_ == DeviceType::Ascend) {
|
||||
#ifdef ENABLE_TDTQUE
|
||||
#ifdef WITH_BACKEND
|
||||
if (create_data_info_queue_) {
|
||||
// This place has a race condition with GetDataInfo, so the first one
|
||||
// arrive here will do the initialize work.
|
||||
|
@ -162,10 +167,6 @@ Status DeviceQueueOp::operator()() {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (!kIsHeterogeneous && tdtInstancePtr->acl_handle_ == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED(
|
||||
"[Internal ERROR] Create channel for sending data failed, please check DEVICE ID setting.");
|
||||
}
|
||||
|
||||
if (dynamic_shape_) {
|
||||
RETURN_IF_NOT_OK(SendDataToAscendDynamic());
|
||||
|
@ -174,7 +175,7 @@ Status DeviceQueueOp::operator()() {
|
|||
}
|
||||
#endif
|
||||
} else if (device_type_ == DeviceType::GPU) {
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#ifdef WITH_BACKEND
|
||||
RETURN_IF_NOT_OK(SendDataToGPU());
|
||||
#endif
|
||||
} else if (device_type_ == DeviceType::CPU) {
|
||||
|
@ -184,7 +185,6 @@ Status DeviceQueueOp::operator()() {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
#ifdef ENABLE_TDTQUE
|
||||
Status DeviceQueueOp::SendDataToAscend() {
|
||||
MS_LOG(INFO) << "Device queue, sending data to Ascend.";
|
||||
#ifndef ENABLE_SECURITY
|
||||
|
@ -242,7 +242,7 @@ Status DeviceQueueOp::SendDataToAscend() {
#endif
PrintBeginInfoWhenFirstBatch(first_push_flag_);
// when training stopped, handle might have been destroyed immediately
if (!kIsHeterogeneous && tdtInstancePtr->acl_handle_ == nullptr) {
if (ascend_data_queue_ != nullptr && !ascend_data_queue_->IsOpen()) {
MS_LOG(WARNING) << "Thread has already been terminated.";
is_break_loop = true;
continue;
@ -306,9 +306,22 @@ Status DeviceQueueOp::SendEpochEndToAscend(const TensorRow &curr_row, const bool
int32_t *tdt_cost, bool *is_break_loop) {
RETURN_UNEXPECTED_IF_NULL(tdt_cost);
RETURN_UNEXPECTED_IF_NULL(is_break_loop);
if (curr_row.eoe() && send_epoch_end_ && tdtInstancePtr->acl_handle_ != nullptr) {
if (curr_row.eoe() && send_epoch_end_ && ascend_data_queue_->IsOpen()) {
TensorRow dummy_row;
auto status = tdtInstancePtr->hostPush(dummy_row, is_profiling_enable, tdt_cost, ACL_TENSOR_DATA_END_OF_SEQUENCE);
#ifndef ENABLE_SECURITY
double start_time = 0;
if (is_profiling_enable) {
start_time = ProfilingTime::GetCurMilliSecond();
}
#endif
auto status = ascend_data_queue_->Push({});
#ifndef ENABLE_SECURITY
if (is_profiling_enable) {
double end_time = ProfilingTime::GetCurMilliSecond();
// compile error occurring when MS_EXCEPTION_IF_NULL
*tdt_cost = static_cast<int32_t>(end_time - start_time);
}
#endif

RETURN_IF_NOT_OK(CheckPushStatus(status, stop_send_, &send_finished_, is_break_loop));
MS_LOG(INFO) << "an epoch has already sent, now stop send data.";
@ -337,8 +350,22 @@ void DeviceQueueOp::LimitSendingBatches(int64_t send_batch, int64_t *sending_num
}

Status DeviceQueueOp::SendRowToTdt(TensorRow curr_row, bool is_profiling_enable, int32_t *tdt_cost) {
auto status = tdtInstancePtr->hostPush(curr_row, is_profiling_enable, tdt_cost);
if (status != Status::OK()) {
std::vector<device::DataQueueItem> items = ConvertTensorRowToDataQueueItem(curr_row);
#ifndef ENABLE_SECURITY
double start_time = 0;
if (is_profiling_enable) {
start_time = ProfilingTime::GetCurMilliSecond();
}
#endif
auto status = ascend_data_queue_->Push(items);
#ifndef ENABLE_SECURITY
if (is_profiling_enable) {
double end_time = ProfilingTime::GetCurMilliSecond();
// compile error occurring when MS_EXCEPTION_IF_NULL
*tdt_cost = static_cast<int32_t>(end_time - start_time);
}
#endif
if (status != device::DataQueueStatus::SUCCESS) {
if (stop_send_) {
MS_LOG(INFO) << "stop_send received";
return Status::OK();
@ -358,15 +385,16 @@ Status DeviceQueueOp::SendRowToTdt(TensorRow curr_row, bool is_profiling_enable,
return Status::OK();
}

Status DeviceQueueOp::CheckPushStatus(Status status, bool stop_send, bool *send_finished, bool *is_break_loop) {
if (status != Status::OK()) {
Status DeviceQueueOp::CheckPushStatus(DataQueueStatus status, bool stop_send, bool *send_finished,
bool *is_break_loop) {
if (status != DataQueueStatus::SUCCESS) {
if (stop_send) {
*send_finished = true;
MS_LOG(INFO) << "stop_send received";
return Status::OK();
}
// when training stopped, handle might have been destroyed immediately
if (tdtInstancePtr->acl_handle_ == nullptr) {
if (!ascend_data_queue_->IsOpen()) {
*is_break_loop = true;
MS_LOG(WARNING) << "Thread has already been terminated.";
return Status::OK();
@ -379,10 +407,14 @@ Status DeviceQueueOp::CheckPushStatus(Status status, bool stop_send, bool *send_
}
return Status::OK();
}
#endif

#ifdef ENABLE_TDTQUE
Status DeviceQueueOp::GetDataInfo(DATA_INFO *data_info) {
#ifdef WITH_BACKEND
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
if (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
RETURN_STATUS_UNEXPECTED("'GetDataInfo' only supported on Ascend.");
}

if (!create_data_info_queue_) {
RETURN_STATUS_UNEXPECTED("[Internal ERROR] DataInfo queue is not created.");
}
@ -396,38 +428,29 @@ Status DeviceQueueOp::GetDataInfo(DATA_INFO *data_info) {
}
}
RETURN_IF_NOT_OK(data_info_queue_ptr_->PopFront(data_info));
#endif
return Status::OK();
}
#else
Status DeviceQueueOp::GetDataInfo(DATA_INFO *data_info) {
RETURN_STATUS_UNEXPECTED("'GetDataInfo' only supported on Ascend.");
}
#endif

#ifdef ENABLE_GPUQUE
Status DeviceQueueOp::SetThreadDevice() {
// Without cudaSetDevice cuda memory will allocate on GPU:0 as default
// and will overload in distribute scenario.
auto ret = cudaSetDevice(rank_id_);
if (ret != cudaSuccess) {
std::string err;
err += "cudaSetDevice failed, ret[";
err += std::to_string(static_cast<int>(ret));
err += "], ";
err += cudaGetErrorString(ret);
RETURN_STATUS_UNEXPECTED(err);
}
#ifdef WITH_BACKEND
(void)device::DataQueueMgr::GetInstance().SetThreadDevice("GPU");
#endif
return Status::OK();
}

Status DeviceQueueOp::LaunchParallelCopyThread() {
#ifdef WITH_BACKEND
RETURN_UNEXPECTED_IF_NULL(tree_);
// Every thread use cuda api should SetThreadDevice
RETURN_IF_NOT_OK(SetThreadDevice());
// CircularPool may not safe under multi-threads scenario, so one worker with one pool
for (int i = 0; i < num_workers_; i++) {
std::shared_ptr<MemoryPool> pool;
RETURN_IF_NOT_OK(CircularPool::CreateCircularPool(&pool, -1, kDeviceQueGpuThreadMemory, false, true));
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
RETURN_IF_NOT_OK(CircularPool::CreateCircularPool(
&pool, -1, kDeviceQueGpuThreadMemory, false,
MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice));
pool_.push_back(pool);
}
gpu_connector_ = std::make_unique<GpuConnector>(num_workers_, 1, queue_capacity_);
@ -437,15 +460,20 @@ Status DeviceQueueOp::LaunchParallelCopyThread() {
tree_->LaunchWorkers(num_workers_, std::bind(&DeviceQueueOp::WorkerEntry, this, std::placeholders::_1), "", id()));
RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("Push data to GPU queue",
std::bind(&DeviceQueueOp::PushDataToGPU, this), nullptr, id()));

#endif
return Status::OK();
}

bool DeviceQueueOp::NoExceptionRaised() {
return !TaskManager::FindMe()->Interrupted() && !mindspore::DataQueueHandler::IsClosed();
#ifdef WITH_BACKEND
return !TaskManager::FindMe()->Interrupted() && !device::DataQueueMgr::GetInstance().IsClosed();
#else
return !TaskManager::FindMe()->Interrupted();
#endif
}

Status DeviceQueueOp::PushDataToGPU() {
#ifdef WITH_BACKEND
RETURN_UNEXPECTED_IF_NULL(tree_);
// Every thread use cuda api should SetThreadDevice
RETURN_IF_NOT_OK(SetThreadDevice());
@ -473,17 +501,17 @@ Status DeviceQueueOp::PushDataToGPU() {
bool eoe_flag = item.eoe_flag;
int64_t send_batch = 0;
auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1, std::placeholders::_2);
BlockQueueStatus_T ret;
DataQueueStatus ret;
if (dynamic_shape_) {
ret = mindspore::DataQueueHandler::OpenDynamicBufQueue(channel_name_, release_function);
ret = device::DataQueueMgr::GetInstance().OpenDynamicBufQueue(channel_name_, release_function);
} else {
ret = mindspore::DataQueueHandler::Open(channel_name_, release_function);
ret = device::DataQueueMgr::GetInstance().Open(channel_name_, release_function);
}
if (ret != BlockQueueStatus_T::SUCCESS) {
if (ret != DataQueueStatus::SUCCESS) {
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to open channel for sending data.");
}

while (!(items.empty() && !eoe_flag) && !mindspore::DataQueueHandler::IsClosed()) {
while (!(items.empty() && !eoe_flag) && !device::DataQueueMgr::GetInstance().IsClosed()) {
if (!eoe_flag) {
#ifdef ENABLE_DUMP_IR
md_channel_info_->RecordBatchQueue(gpu_connector_->size());
@ -524,8 +552,8 @@ Status DeviceQueueOp::PushDataToGPU() {
eoe_flag = item.eoe_flag;
// If the batches send by dataset are more than gpu calculate, gpu will core for no signal notify.
if (rc.IsError()) {
mindspore::DataQueueHandler::Close(channel_name_);
mindspore::DataQueueHandler::CloseConfirm();
device::DataQueueMgr::GetInstance().Close(channel_name_);
device::DataQueueMgr::GetInstance().CloseConfirm();
return rc;
}
} else {

@ -540,8 +568,8 @@ Status DeviceQueueOp::PushDataToGPU() {
tree_->SetFinished();
MS_LOG(INFO) << "ExecutionTree finished. Device queue pushed number of batches: " << send_batch;

mindspore::DataQueueHandler::Close(channel_name_);
mindspore::DataQueueHandler::CloseConfirm();
device::DataQueueMgr::GetInstance().Close(channel_name_);
device::DataQueueMgr::GetInstance().CloseConfirm();
return Status::OK();
}
@ -553,7 +581,7 @@ Status DeviceQueueOp::WorkerEntry(int32_t worker_id) {
TensorRow current_row;
uint32_t batch_num = 0;
RETURN_IF_NOT_OK(receive_queues_[worker_id]->PopFront(&current_row));
while (!current_row.quit() && !mindspore::DataQueueHandler::IsClosed()) {
while (!current_row.quit() && !device::DataQueueMgr::GetInstance().IsClosed()) {
GpuConnectorItem connector_item = {{}, current_row.eoe()};
if (!connector_item.eoe_flag) {
std::vector<device::DataQueueItem> items;

@ -565,6 +593,7 @@ Status DeviceQueueOp::WorkerEntry(int32_t worker_id) {
data_item.worker_id_ = worker_id;
items.push_back(data_item);
}

RETURN_IF_NOT_OK(MallocForGPUData(&items, current_row, worker_id));
connector_item.data_item = std::move(items);
batch_num++;

@ -579,10 +608,12 @@ Status DeviceQueueOp::WorkerEntry(int32_t worker_id) {
// Add empty data_item vector with eoe_flag=false as quit flag.
GpuConnectorItem connector_item = {{}, false};
RETURN_IF_NOT_OK(gpu_connector_->Add(worker_id, std::move(connector_item)));
#endif
return Status::OK();
}

Status DeviceQueueOp::SendDataToGPU() {
#ifdef WITH_BACKEND
RETURN_IF_NOT_OK(LaunchParallelCopyThread());
MS_LOG(INFO) << "Device queue, sending data to GPU.";
#ifndef ENABLE_SECURITY
@ -594,10 +625,8 @@ Status DeviceQueueOp::SendDataToGPU() {
first_fetch_flag_ = true;
int64_t num_buf = 0;
bool is_break_loop = false;
uint32_t batch_num = 0;
while (!current_row.eof() && !is_break_loop && !mindspore::DataQueueHandler::IsClosed()) {
while (!current_row.eoe() && !is_break_loop && !mindspore::DataQueueHandler::IsClosed()) {
batch_num++;
while (!current_row.eof() && !is_break_loop && !device::DataQueueMgr::GetInstance().IsClosed()) {
while (!current_row.eoe() && !is_break_loop && !device::DataQueueMgr::GetInstance().IsClosed()) {
RETURN_IF_NOT_OK(FilterMetadata(&current_row));
RETURN_IF_NOT_OK(CheckExceptions(current_row));
#ifndef ENABLE_SECURITY

@ -636,6 +665,7 @@ Status DeviceQueueOp::SendDataToGPU() {
}

MS_LOG(INFO) << "Device queue received number of batches and EOEs: " << (num_buf - num_workers_);
#endif
return Status::OK();
}
@ -664,23 +694,23 @@ Status DeviceQueueOp::MallocForGPUData(std::vector<device::DataQueueItem> *items
}

Status DeviceQueueOp::ClearDevice() {
#ifdef WITH_BACKEND
MS_LOG(INFO) << "Clearing the data in GPU device: " << device_id_ << " channel: " << channel_name_;
auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1, std::placeholders::_2);
auto ret = mindspore::DataQueueHandler::Open(channel_name_, release_function);
if (ret != BlockQueueStatus_T::SUCCESS) {
auto ret = device::DataQueueMgr::GetInstance().Open(channel_name_, release_function);
if (ret != DataQueueStatus::SUCCESS) {
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to open channel for clearing the device.");
}

ret = mindspore::DataQueueHandler::Clear(channel_name_);
CHECK_FAIL_RETURN_UNEXPECTED(!ret, "Failed to clear the device.");
ret = device::DataQueueMgr::GetInstance().Clear(channel_name_);
CHECK_FAIL_RETURN_UNEXPECTED(ret == DataQueueStatus::SUCCESS, "Failed to clear the device.");

mindspore::DataQueueHandler::Close(channel_name_);
mindspore::DataQueueHandler::CloseConfirm();
device::DataQueueMgr::GetInstance().Close(channel_name_);
device::DataQueueMgr::GetInstance().CloseConfirm();
#endif
return Status::OK();
}

#endif

Status DeviceQueueOp::SendDataToCPU() {
MS_LOG(INFO) << "Device queue, sending data to CPU.";
int64_t total_batch = 0;
@ -794,6 +824,7 @@ void DeviceQueueOp::PrintEndInfoWhenFirstBatch(bool *first_push_flag) {
}
Status DeviceQueueOp::RetryPushData(const std::vector<DataQueueItem> &items, const bool profiling,
uint64_t *push_time) {
#ifdef WITH_BACKEND
bool flag_log = false;
#ifndef ENABLE_SECURITY
uint64_t start_time = 0;

@ -801,10 +832,10 @@ Status DeviceQueueOp::RetryPushData(const std::vector<DataQueueItem> &items, con
start_time = ProfilingTime::GetCurMilliSecond();
}
#endif
while (!mindspore::DataQueueHandler::IsClosed() && !TaskManager::FindMe()->Interrupted()) {
BlockQueueStatus_T ret = mindspore::DataQueueHandler::Push(channel_name_, items, WAIT_TIME);
if (ret != BlockQueueStatus_T::SUCCESS) {
if (ret == BlockQueueStatus_T::ERROR_INPUT) {
while (!device::DataQueueMgr::GetInstance().IsClosed() && !TaskManager::FindMe()->Interrupted()) {
DataQueueStatus ret = device::DataQueueMgr::GetInstance().Push(channel_name_, items, WAIT_TIME);
if (ret != DataQueueStatus::SUCCESS) {
if (ret == DataQueueStatus::ERROR_INPUT) {
return Status(
StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
"Invalid data, the types or shapes of current row is different with previous row(i.e. do batch operation but "
@ -827,11 +858,13 @@ Status DeviceQueueOp::RetryPushData(const std::vector<DataQueueItem> &items, con
if (profiling) {
*push_time = ProfilingTime::GetCurMilliSecond() - start_time;
}
#endif
#endif
return Status::OK();
}

Status DeviceQueueOp::SendDataToAscendDynamic() {
#ifdef WITH_BACKEND
MS_LOG(DEBUG) << "Dynamic Device queue, sending data to Ascend.";

int64_t send_batch = 0;

@ -840,8 +873,8 @@ Status DeviceQueueOp::SendDataToAscendDynamic() {
bool is_break_loop = false;

std::function<void(void *, int32_t)> release_function([](void *, int32_t) { return; });
auto ret = mindspore::DataQueueHandler::OpenDynamicBufQueue(channel_name_, release_function);
if (ret != BlockQueueStatus_T::SUCCESS) {
auto ret = device::DataQueueMgr::GetInstance().OpenDynamicBufQueue(channel_name_, release_function);
if (ret != DataQueueStatus::SUCCESS) {
return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
"[Internal ERROR] Failed to open channel for sending data.");
}
@ -854,16 +887,7 @@ Status DeviceQueueOp::SendDataToAscendDynamic() {
while (!curr_row.eoe() && !is_break_loop) {
RETURN_IF_NOT_OK(FilterMetadata(&curr_row));
RETURN_IF_NOT_OK(CheckExceptions(curr_row));
std::vector<device::DataQueueItem> items;
for (auto &i : curr_row) {
device::DataQueueItem data_item;
data_item.data_len_ = static_cast<size_t>(i->SizeInBytes());
data_item.shapes_ = i->shape().AsVector();
data_item.data_ptr_ = const_cast<void *>(static_cast<const void *>(i->GetBuffer()));
data_item.data_type_ = i->type().ToString();
items.push_back(data_item);
}

std::vector<device::DataQueueItem> items = ConvertTensorRowToDataQueueItem(curr_row);
RETURN_IF_NOT_OK(RetryPushData(items, false, &data_queue_cost));
if (create_data_info_queue_) {
DATA_INFO data_info;

@ -895,9 +919,9 @@ Status DeviceQueueOp::SendDataToAscendDynamic() {
tree_->SetFinished();
MS_LOG(INFO) << "ExecutionTree finished. Device queue sent number of batches: " << send_batch;

mindspore::DataQueueHandler::Close(channel_name_);
mindspore::DataQueueHandler::CloseConfirm();

device::DataQueueMgr::GetInstance().Close(channel_name_);
device::DataQueueMgr::GetInstance().CloseConfirm();
#endif
return Status::OK();
}
} // namespace dataset
@ -27,30 +27,20 @@

#include "minddata/dataset/engine/perf/device_queue_tracing.h"
#include "minddata/dataset/util/status.h"
#include "mindspore/core/utils/data_queue_handler.h"
#ifdef ENABLE_DUMP_IR
#include "minddata/dataset/util/rdr.h"
#endif

#ifdef ENABLE_TDTQUE
#include "minddata/dataset/util/queue.h"
#include "minddata/dataset/engine/device_queue_impl/device_queue_base.h"
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_plugin.h"
#include "minddata/dataset/engine/device_queue_impl/host_queue/host_queue_plugin.h"
#endif

#ifdef ENABLE_GPUQUE
#include "minddata/dataset/engine/gpu_item_connector.h"
#include "minddata/dataset/util/circular_pool.h"
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
#endif
#include "mindspore/ccsrc/include/backend/data_queue/data_queue.h"

namespace mindspore {
namespace dataset {
class GpuConnector;
using DATA_INFO = std::vector<std::pair<DataType, TensorShape>>;
using DATA_INFO_QUEUE = Queue<DATA_INFO>;
using mindspore::device::BlockQueueStatus_T;
using mindspore::device::DataQueueItem;
using mindspore::device::DataQueueStatus;
constexpr int32_t kTimeOutMilliSeconds = 25000;
const int kDataInfoQueueCapacity = 128;

@ -83,13 +73,9 @@ class DeviceQueueOp : public PipelineOp {
stop_send_ = false;
}

#ifdef ENABLE_TDTQUE
void StopWaiting() { ascend_keep_waiting_ = false; }
#endif

#ifdef ENABLE_GPUQUE
Status ClearDevice();
#endif

Status GetDataInfo(DATA_INFO *data_info);

@ -136,7 +122,6 @@ class DeviceQueueOp : public PipelineOp {
bool NoExceptionRaised();
Status SendDataToAscendDynamic();

#ifdef ENABLE_TDTQUE
void WaitContinueSignal() const;
Status SendDataToAscend();
Status SendEpochEndToAscend(const TensorRow &curr_row, const bool &is_profiling_enable, int32_t *tdt_cost,

@ -144,11 +129,9 @@ class DeviceQueueOp : public PipelineOp {
void LimitSendingBatches(int64_t send_batch, int64_t *sending_num, std::shared_ptr<ConfigManager> cfg);
Status SendRowToTdt(TensorRow curr_row, bool is_profiling_enable, int32_t *tdt_cost);
// check status that push data into device
Status CheckPushStatus(Status status, bool stop_send, bool *send_finished, bool *is_break_loop);
Status CheckPushStatus(DataQueueStatus status, bool stop_send, bool *send_finished, bool *is_break_loop);
bool ascend_keep_waiting_;
#endif

#ifdef ENABLE_GPUQUE
Status SendDataToGPU();
Status MallocForGPUData(std::vector<device::DataQueueItem> *items, const TensorRow &curr_row,
const int32_t &worker_id);

@ -166,11 +149,6 @@ class DeviceQueueOp : public PipelineOp {
const uint32_t kDeviceQueGpuThreadMemory = 1024;
uint32_t num_workers_;
uint32_t queue_capacity_;
// This rank_id is for device_queue, one process work with only one rank_id,
// for standalone scenario, this rank_id may come from env 'CUDA_VISIBLE_DEVICES',
// but for distribute scenario, this rank_id come from _get_global_rank() in python
uint32_t rank_id_;
#endif

Status SendDataToCPU();
#ifndef ENABLE_SECURITY

@ -195,10 +173,8 @@ class DeviceQueueOp : public PipelineOp {
std::mutex data_info_mutex_;
bool first_push_flag_; // default: false, when first push, it will be true
bool dynamic_shape_{false};
std::shared_ptr<device::DataQueue> ascend_data_queue_;

#ifdef ENABLE_TDTQUE
std::shared_ptr<DeviceQueueBase> tdtInstancePtr;
#endif
#ifdef ENABLE_DUMP_IR
std::shared_ptr<MDChannelInfo> md_channel_info_;
#endif
@ -1,11 +0,0 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
if(ENABLE_TDTQUE)
add_subdirectory(tdt)
add_subdirectory(host_queue)
endif()
add_library(engine-device-queue OBJECT device_queue_base.cc)
if(ENABLE_TDTQUE)
add_dependencies(engine-device-queue engine-tdt)
add_dependencies(engine-device-queue engine-host-queue)
endif()
@ -1,95 +0,0 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/device_queue_impl/device_queue_base.h"
#include "utils/ms_utils.h"
#ifndef ENABLE_SECURITY
#include "minddata/dataset/engine/perf/profiling.h"
#endif
#include "minddata/dataset/util/log_adapter.h"
#if ENABLE_D
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
#endif

namespace mindspore {
namespace dataset {
constexpr auto kUnknownErrorString = "Unknown error occurred";
extern const bool kIsHeterogeneous = []() noexcept -> bool {
int32_t is_heterogeneous = 0;
(void)rtGetIsHeterogenous(&is_heterogeneous);
return is_heterogeneous == 1;
}();

DeviceQueueBase::DeviceQueueBase(const std::string &channel_name, int32_t device_id) {
// init ErrorManager, 0 means success
if (ErrorManager::GetInstance().Init() != 0) {
MS_LOG(WARNING) << "[Internal Error] Init ErrorManager failed.";
}
channel_name_ = channel_name;
device_id_ = device_id;
}

Status DeviceQueueBase::GetAclDataType(DataType d_type, aclDataType *datatype) {
switch (d_type.value()) {
case DataType::DE_BOOL:
*datatype = ACL_BOOL;
break;
case DataType::DE_INT8:
*datatype = ACL_INT8;
break;
case DataType::DE_UINT8:
*datatype = ACL_UINT8;
break;
case DataType::DE_INT16:
*datatype = ACL_INT16;
break;
case DataType::DE_UINT16:
*datatype = ACL_UINT16;
break;
case DataType::DE_INT32:
*datatype = ACL_INT32;
break;
case DataType::DE_UINT32:
*datatype = ACL_UINT32;
break;
case DataType::DE_FLOAT16:
*datatype = ACL_FLOAT16;
break;
case DataType::DE_FLOAT32:
*datatype = ACL_FLOAT;
break;
case DataType::DE_FLOAT64:
*datatype = ACL_DOUBLE;
break;
case DataType::DE_INT64:
*datatype = ACL_INT64;
break;
case DataType::DE_UINT64:
*datatype = ACL_UINT64;
break;
default:
RETURN_STATUS_UNEXPECTED("Invalid data, got unexpected data type.");
}
return Status::OK();
}

void DeviceQueueBase::ReportErrorMessage() {
const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage();
if (!error_message.empty() && error_message.find(kUnknownErrorString) == std::string::npos) {
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
}
}
} // namespace dataset
} // namespace mindspore
@ -1,65 +0,0 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_DEVICE_QUEUE_IMPL_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_DEVICE_QUEUE_IMPL_H_

#include <dlfcn.h>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <thread>
#include "acl/acl_tdt.h"
#include "runtime/rt_mem_queue.h"
#include "runtime/dev.h"
#include "runtime/config.h"
#include "graph/def_types.h"
#include "common/util/error_manager/error_manager.h"

#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/core/tensor_row.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {
extern const bool kIsHeterogeneous;

class DeviceQueueBase {
public:
virtual Status hostPush(TensorRow ts_row, bool profiling, int32_t *time,
acltdtTensorType tdt_type = ACL_TENSOR_DATA_TENSOR) = 0;

DeviceQueueBase(const std::string &channel_name, int32_t device_id);

virtual ~DeviceQueueBase() {}

acltdtChannelHandle *acl_handle_;

protected:
Status GetAclDataType(DataType d_type, aclDataType *datatype);

void ReportErrorMessage();

std::string channel_name_;

int32_t device_id_ = 0;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_DEVICE_QUEUE_IMPL_H_
@ -1,3 +0,0 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(engine-host-queue OBJECT host_queue_plugin.cc)
@ -1,300 +0,0 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/engine/device_queue_impl/host_queue/host_queue_plugin.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "minddata/dataset/engine/perf/profiling.h"
|
||||
#endif
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
constexpr auto kMbufHeadEndOfSequencePos = 128U;
|
||||
constexpr auto kEndOfSequenceFlag = 0x5A;
|
||||
constexpr auto kTransIdOffset = 64UL;
|
||||
constexpr auto kSleepMilliSeconds = 500;
|
||||
const std::map<acltdtTensorType, int32_t> type_map = {
|
||||
{ACL_TENSOR_DATA_TENSOR, 0}, {ACL_TENSOR_DATA_END_OF_SEQUENCE, 1}, {ACL_TENSOR_DATA_ABNORMAL, 2}};
|
||||
|
||||
HostQueueImpl::HostQueueImpl(const std::string &channel_name, int32_t device_id)
|
||||
: DeviceQueueBase(channel_name, device_id) {
|
||||
auto status = HostQueueInit();
|
||||
if (status != Status::OK()) {
|
||||
MS_LOG(WARNING) << "Host queue init failed.";
|
||||
}
|
||||
}
|
||||
|
||||
Status HostQueueImpl::hostPush(TensorRow ts_row, bool profiling, int32_t *time, acltdtTensorType tdt_type) {
|
||||
#ifndef ENABLE_SECURITY
|
||||
double start_time;
|
||||
if (profiling) {
|
||||
start_time = ProfilingTime::GetCurMilliSecond();
|
||||
}
|
||||
#endif
|
||||
// send data by datagw
|
||||
auto status = SendDataByHostQueue(ts_row, tdt_type);
|
||||
if (status != Status::OK()) {
|
||||
RETURN_STATUS_UNEXPECTED("Host queue send data failed.");
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (profiling) {
|
||||
double end_time = ProfilingTime::GetCurMilliSecond();
|
||||
*time = static_cast<int32_t>(end_time - start_time);
|
||||
}
|
||||
#endif
|
||||
return status;
|
||||
}
|
||||
|
||||
Status HostQueueImpl::HostQueueInit() {
|
||||
auto ret = rtSetDevice(device_id_);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtSetDevice failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:rtSetDevice failed.");
|
||||
}
|
||||
|
||||
ret = rtMemQueueInit(device_id_);
|
||||
if (ret != ACL_RT_SUCCESS && ret != ACL_ERROR_RT_REPEATED_INIT) {
|
||||
MS_LOG(ERROR) << "call rtMemQueueInit failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:rtMemQueueInit failed.");
|
||||
}
|
||||
|
||||
rtMemQueueAttr_t attr = {};
|
||||
auto mem_ret = memset_s(attr.name, RT_MQ_MAX_NAME_LEN, 0, RT_MQ_MAX_NAME_LEN);
|
||||
if (mem_ret != EOK) {
|
||||
MS_LOG(ERROR) << "call memset_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:memset_s failed.");
|
||||
}
|
||||
mem_ret = memcpy_s(attr.name, RT_MQ_MAX_NAME_LEN, channel_name_.c_str(), channel_name_.size() + 1);
|
||||
if (mem_ret != EOK) {
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:memcpy_s failed.");
|
||||
}
|
||||
|
||||
attr.depth = 128U;
|
||||
attr.workMode = RT_MQ_MODE_DEFAULT;
|
||||
attr.flowCtrlFlag = false;
|
||||
attr.flowCtrlDropTime = 0U;
|
||||
attr.overWriteFlag = false;
|
||||
ret = rtMemQueueCreate(device_id_, &attr, &queue_id_);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMemQueueCreate failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:rtMemQueueCreate failed.");
|
||||
}
|
||||
|
||||
rtMemBuffCfg_t buff_cfg = {0};
|
||||
ret = rtMbufInit(&buff_cfg);
|
||||
if (ret != ACL_RT_SUCCESS && ret != ACL_ERROR_RT_REPEATED_INIT) {
|
||||
MS_LOG(ERROR) << "call rtMbufInit failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("Init:rtMbufInit failed.");
|
||||
}
|
||||
const std::lock_guard<std::mutex> lk(queue_id_to_trans_id_map_mutex);
|
||||
(void)queue_id_to_trans_id_map.insert({queue_id_, 0UL});
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::AddDataItemInfo(const acltdtTensorType &tdt_data_type, const int32_t &tensor_type,
|
||||
const int64_t *dims, const size_t &dim_size, void *data_ptr,
|
||||
const uint64_t &data_len, std::vector<DataItemInfo> *items) {
|
||||
DataItemInfo item = {};
|
||||
int32_t data_type = 0;
|
||||
auto it = type_map.find(tdt_data_type);
|
||||
if (it != type_map.end()) {
|
||||
data_type = it->second;
|
||||
}
|
||||
item.ctrl_info.data_type = data_type;
|
||||
item.ctrl_info.tensor_type = tensor_type;
|
||||
item.ctrl_info.dim_num = dim_size;
|
||||
item.ctrl_info.data_len = data_len;
|
||||
item.dims.clear();
|
||||
for (size_t i = 0; i < dim_size; ++i) {
|
||||
item.dims.push_back(dims[i]);
|
||||
}
|
||||
item.data_ptr = data_ptr;
|
||||
items->push_back(item);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::CreateDataItemInfos(const acltdtTensorType &acl_type, const TensorRow &ts_row,
|
||||
std::vector<DataItemInfo> *items) {
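// Non-tensor control types (e.g. end-of-sequence) are encoded as a single placeholder
// item with no dims and no payload; otherwise every tensor in the row becomes one
// DataItemInfo. String tensors are rejected on this path.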
|
||||
if (acl_type != ACL_TENSOR_DATA_TENSOR) {
|
||||
return AddDataItemInfo(acl_type, ACL_BOOL, nullptr, 0UL, nullptr, 0UL, items);
|
||||
}
|
||||
|
||||
for (auto &ts : ts_row) {
|
||||
aclDataType data_type;
|
||||
RETURN_IF_NOT_OK(GetAclDataType(ts->type(), &data_type));
|
||||
TensorShape tsShape = ts->shape();
|
||||
std::shared_ptr<void> dataPtr =
|
||||
std::shared_ptr<void>(reinterpret_cast<uchar *>(&(*ts->begin<uint8_t>())), [](const void *elem) {});
|
||||
size_t dataLen = ts->SizeInBytes();
|
||||
const dsize_t dims = tsShape.Rank();
|
||||
std::vector<int64_t> dataShape;
|
||||
for (auto i = 0; i < dims; i++) {
|
||||
dataShape.emplace_back(tsShape[i]);
|
||||
}
|
||||
if (ts->type() != DataType::DE_STRING) {
|
||||
RETURN_IF_NOT_OK(AddDataItemInfo(ACL_TENSOR_DATA_TENSOR, data_type, (tsShape.empty() ? nullptr : &dataShape[0]),
|
||||
dims, dataPtr.get(), dataLen, items));
|
||||
} else {
|
||||
RETURN_STATUS_UNEXPECTED("Create data item failed when send data with type:" + std::to_string(data_type));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::SerializeDataItemInfos(std::vector<DataItemInfo> *items, void **buff,
|
||||
const acltdtTensorType &acl_type) {
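// Pack all items into one mbuf. Layout per item: the fixed ItemInfo control header,
// followed by dim_num int64_t dimensions, followed by data_len bytes of tensor data;
// total_size below is the sum of these three parts over all items.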
|
||||
size_t cnt = items->size();
|
||||
size_t total_size = 0UL;
|
||||
for (size_t i = 0UL; i < cnt; ++i) {
|
||||
(*items)[i].ctrl_info.cur_cnt = i;
|
||||
(*items)[i].ctrl_info.cnt = cnt;
|
||||
total_size += sizeof(ItemInfo) + (*items)[i].ctrl_info.dim_num * sizeof(int64_t) + (*items)[i].ctrl_info.data_len;
|
||||
}
|
||||
|
||||
auto rt_error = rtMbufAlloc(buff, total_size);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufAlloc with size[" << total_size << "] failed, ret = " << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo: rtMbufAlloc failed.");
|
||||
}
|
||||
|
||||
void *data = nullptr;
|
||||
rt_error = rtMbufGetBuffAddr(*buff, &data);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call rtMbufGetBuffAddr with size[" << total_size << "] failed, ret = " << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:rtMbufGetBuffAddr failed.");
|
||||
}
|
||||
|
||||
void *head_buf = nullptr;
|
||||
uint64_t head_size = 0UL;
|
||||
rt_error = rtMbufGetPrivInfo(*buff, &head_buf, &head_size);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call rtMbufGetPrivInfo failed, ret =" << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:rtMbufGetPrivInfo failed.");
|
||||
}
|
||||
if ((head_buf != nullptr) && (head_size > kMbufHeadEndOfSequencePos)) {
|
||||
MS_LOG(DEBUG) << "host queue set end_of_sequence mbuf head.";
|
||||
}
|
||||
|
||||
size_t offset = 0UL;
|
||||
for (size_t i = 0UL; i < cnt; ++i) {
|
||||
auto mem_ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), sizeof(ItemInfo), &((*items)[i].ctrl_info),
|
||||
sizeof(ItemInfo));
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:memcpy_s failed.");
|
||||
}
|
||||
offset += sizeof(ItemInfo);
|
||||
|
||||
for (size_t j = 0UL; j < (*items)[i].ctrl_info.dim_num; ++j) {
|
||||
mem_ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), sizeof(int64_t), &((*items)[i].dims[j]),
|
||||
sizeof(int64_t));
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:memcpy_s failed.");
|
||||
}
|
||||
offset += sizeof(int64_t);
|
||||
}
|
||||
|
||||
if ((*items)[i].ctrl_info.data_len == 0UL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
mem_ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), (*items)[i].ctrl_info.data_len,
|
||||
(*items)[i].data_ptr, (*items)[i].ctrl_info.data_len);
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
RETURN_STATUS_UNEXPECTED("SerializeDataItemInfo:memcpy_s failed.");
|
||||
}
|
||||
offset += (*items)[i].ctrl_info.data_len;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::LaunchTensor2MBuff(const acltdtTensorType &acl_type, const TensorRow &tensor_row, void **buff) {
|
||||
std::vector<DataItemInfo> items;
|
||||
RETURN_IF_NOT_OK(CreateDataItemInfos(acl_type, tensor_row, &items));
|
||||
RETURN_IF_NOT_OK(SerializeDataItemInfos(&items, buff, acl_type));
|
||||
RETURN_IF_NOT_OK(SetTransId4MBuf(buff));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::SetTransId4MBuf(void **buff) {
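// The transaction id is written into the mbuf private info area, kTransIdOffset bytes
// before its end. The per-queue counter in queue_id_to_trans_id_map is incremented under
// the mutex, so every buffer pushed to a queue carries a monotonically increasing id.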
|
||||
void *head_buff = nullptr;
|
||||
uint64_t head_size = 0UL;
|
||||
auto ret = rtMbufGetPrivInfo(*buff, &head_buff, &head_size);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufGetPrivInfo failed, ret =" << ret;
|
||||
RETURN_STATUS_UNEXPECTED("HostQueueSetTransId:rtMbufGetPrivInfo failed.");
|
||||
}
|
||||
uint64_t *trans_id = reinterpret_cast<uint64_t *>(static_cast<uint8_t *>(head_buff) + head_size - kTransIdOffset);
|
||||
const std::lock_guard<std::mutex> lk(queue_id_to_trans_id_map_mutex);
|
||||
*trans_id = ++queue_id_to_trans_id_map[queue_id_];
|
||||
MS_LOG(DEBUG) << "host queue[" << queue_id_ << "] set trans id[" << *trans_id << "] success";
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::EnqueueData(void *buff, bool *need_resend) {
|
||||
*need_resend = false;
|
||||
auto rt_error = rtSetDevice(device_id_);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtSetDevice device failed, ret=" << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("rtSetDevice failed.");
|
||||
}
|
||||
rt_error = rtMemQueueEnQueue(device_id_, queue_id_, buff);
|
||||
if (rt_error == RT_ERROR_NONE) {
|
||||
return Status::OK();
|
||||
} else if (rt_error == ACL_ERROR_RT_QUEUE_FULL) {
|
||||
*need_resend = true;
|
||||
MS_LOG(DEBUG) << "queue[" << queue_id_ << "] is full, need call rtMemQueueEnQueue again";
|
||||
} else {
|
||||
HostQueueFreeBuff(buff);
|
||||
MS_LOG(ERROR) << "host enqueue queue[" << queue_id_ << "] failed, ret = " << rt_error;
|
||||
RETURN_STATUS_UNEXPECTED("host enqueue queue failed.");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status HostQueueImpl::SendDataByHostQueue(const TensorRow &tensor_row, const acltdtTensorType &data_type) {
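// Serialize the tensor row into an mbuf once, then retry the enqueue while the device
// queue reports "full" (EnqueueData sets is_need_resend), sleeping kSleepMilliSeconds
// between attempts; any other enqueue error terminates the loop.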
|
||||
Status status;
|
||||
bool is_need_resend = false;
|
||||
void *buff = nullptr;
|
||||
|
||||
RETURN_IF_NOT_OK(LaunchTensor2MBuff(data_type, tensor_row, &buff));
|
||||
do {
|
||||
if (is_need_resend) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kSleepMilliSeconds));
|
||||
}
|
||||
status = EnqueueData(buff, &is_need_resend);
|
||||
} while (status == Status::OK() && is_need_resend);
|
||||
return status;  // propagate the final enqueue result instead of always reporting OK
|
||||
}
|
||||
|
||||
void HostQueueImpl::HostQueueFreeBuff(void *buff) {
|
||||
auto rt_error = rtMbufFree(buff);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufFree failed, ret=" << rt_error;
|
||||
}
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -1,91 +0,0 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_HOST_QUEUE_HOST_QUEUE_PLUGIN_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_HOST_QUEUE_HOST_QUEUE_PLUGIN_H_
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
#include "acl/acl_tdt.h"
|
||||
#include "runtime/rt_mem_queue.h"
|
||||
#include "runtime/dev.h"
|
||||
#include "runtime/config.h"
|
||||
#include "graph/def_types.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/device_queue_base.h"
|
||||
|
||||
#include "minddata/dataset/core/data_type.h"
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/core/tensor_row.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
struct ItemInfo {
|
||||
int32_t version;
|
||||
int32_t data_type;
|
||||
uint32_t cur_cnt;
|
||||
uint32_t cnt;
|
||||
int32_t tensor_type;
|
||||
uint32_t dim_num;
|
||||
char reserved[32];
|
||||
uint64_t data_len;
|
||||
};
|
||||
|
||||
struct DataItemInfo {
|
||||
ItemInfo ctrl_info;
|
||||
std::vector<int64_t> dims;
|
||||
void *data_ptr;
|
||||
};
|
||||
|
||||
class HostQueueImpl : public DeviceQueueBase {
|
||||
public:
|
||||
Status hostPush(TensorRow ts_row, bool profiling, int32_t *time, acltdtTensorType tdt_type = ACL_TENSOR_DATA_TENSOR);
|
||||
|
||||
HostQueueImpl(const std::string &channel_name, int32_t device_id);
|
||||
|
||||
~HostQueueImpl() {}
|
||||
|
||||
private:
|
||||
Status HostQueueInit();
|
||||
|
||||
Status SendDataByHostQueue(const TensorRow &tensor_row, const acltdtTensorType &data_type);
|
||||
|
||||
Status SetTransId4MBuf(void **buff);
|
||||
|
||||
Status LaunchTensor2MBuff(const acltdtTensorType &acl_type, const TensorRow &tensor_row, void **buff);
|
||||
Status EnqueueData(void *buff, bool *need_resend);
|
||||
|
||||
Status CreateDataItemInfos(const acltdtTensorType &acl_type, const TensorRow &ts_row,
|
||||
std::vector<DataItemInfo> *items);
|
||||
Status SerializeDataItemInfos(std::vector<DataItemInfo> *items, void **buff, const acltdtTensorType &acl_type);
|
||||
Status AddDataItemInfo(const acltdtTensorType &tdt_data_type, const int32_t &tensor_type, const int64_t *dims,
|
||||
const size_t &dim_size, void *data_ptr, const uint64_t &data_len,
|
||||
std::vector<DataItemInfo> *items);
|
||||
void HostQueueFreeBuff(void *buff);
|
||||
|
||||
std::mutex queue_id_to_trans_id_map_mutex;
|
||||
std::map<uint32_t, uint64_t> queue_id_to_trans_id_map;
|
||||
uint32_t queue_id_ = 0;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_HOST_QUEUE_HOST_QUEUE_PLUGIN_H_
|
|
@ -1,3 +0,0 @@
|
|||
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
|
||||
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
|
||||
add_library(engine-tdt OBJECT tdt_plugin.cc tdt_handle.cc)
|
|
@ -1,20 +0,0 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
|
||||
// Implementation code for TdtHandle moved from 'tdt_handle.cc' to 'tdt_handle.h';
|
||||
// This is a workaround for cyclic dependency problem between dataset and core
|
||||
// when ENABLE_TDTQUE is enabled.
|
|
@ -1,84 +0,0 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_HANDLE_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_HANDLE_H_
|
||||
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
#include "acl/acl_tdt.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
class TdtHandle {
|
||||
public:
|
||||
static inline void AddHandle(acltdtChannelHandle **handle, std::thread *use_thread);
|
||||
|
||||
static inline bool DestroyHandle();
|
||||
|
||||
static inline void DelHandle(acltdtChannelHandle **handle);
|
||||
|
||||
private:
|
||||
TdtHandle() {}
|
||||
~TdtHandle() = default;
|
||||
};
|
||||
|
||||
inline void TdtHandle::AddHandle(acltdtChannelHandle **handle, std::thread *use_thread) {
|
||||
if (*handle != nullptr) {
|
||||
auto ret = acl_handle_map.emplace(reinterpret_cast<void **>(handle), use_thread);
|
||||
if (!std::get<1>(ret)) {
|
||||
MS_LOG(ERROR) << "Failed to add new handle to acl_handle_map." << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void TdtHandle::DelHandle(acltdtChannelHandle **handle) {
|
||||
void **void_handle = reinterpret_cast<void **>(handle);
|
||||
acl_handle_map.erase(void_handle);
|
||||
}
|
||||
|
||||
inline bool TdtHandle::DestroyHandle() {
|
||||
bool destroy_all = true;
|
||||
for (auto &item : acl_handle_map) {
|
||||
acltdtChannelHandle **handle = reinterpret_cast<acltdtChannelHandle **>(item.first);
|
||||
if (*handle != nullptr) {
|
||||
aclError stop_status = acltdtStopChannel(*handle);
|
||||
if (stop_status != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed stop acl data channel and the stop status is " << stop_status << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (item.second != nullptr && item.second->joinable()) {
|
||||
item.second->join();
|
||||
}
|
||||
if (acltdtDestroyChannel(*handle) != ACL_SUCCESS) {
|
||||
destroy_all = false;
|
||||
} else {
|
||||
*handle = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// clear the map container when all the handle has been destroyed
|
||||
if (destroy_all) {
|
||||
acl_handle_map.clear();
|
||||
}
|
||||
return destroy_all;
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_HANDLE_H_
|
|
@ -1,205 +0,0 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_plugin.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "minddata/dataset/engine/perf/profiling.h"
|
||||
#endif
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
#include "minddata/dataset/util/task_manager.h"
|
||||
#if ENABLE_D
|
||||
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
TdtPlugin::TdtPlugin(const std::string &channel_name, int32_t device_id) : DeviceQueueBase(channel_name, device_id) {
|
||||
// init ErrorManager, 0 means success
|
||||
if (ErrorManager::GetInstance().Init() != 0) {
|
||||
MS_LOG(WARNING) << "[Internal Error] Init ErrorManager failed.";
|
||||
}
|
||||
// create acl tdt handle
|
||||
acl_handle_ = acltdtCreateChannel(device_id, channel_name.c_str());
|
||||
if (acl_handle_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Create channel for sending data failed, please check DEVICE ID setting, DEVICE ID that passed "
|
||||
"into dataset(from context) and training process should be the same.";
|
||||
ReportErrorMessage();
|
||||
}
|
||||
TdtHandle::AddHandle(&acl_handle_, nullptr);
|
||||
}
|
||||
|
||||
TdtPlugin::~TdtPlugin() {
|
||||
if (acl_handle_ != nullptr) {
|
||||
if (acltdtDestroyChannel(acl_handle_) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed to destroy channel for tdt queue.";
|
||||
ReportErrorMessage();
|
||||
} else {
|
||||
TdtHandle::DelHandle(&acl_handle_);
|
||||
acl_handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Status TdtPlugin::hostPush(TensorRow ts_row, bool profiling, int32_t *time, acltdtTensorType tdt_type) {
|
||||
MS_LOG(DEBUG) << "TDT channel name is " << channel_name_ << ".";
|
||||
acltdtDataset *acl_dataset = nullptr;
|
||||
#ifndef ENABLE_SECURITY
|
||||
double start_time;
|
||||
#endif
|
||||
auto ret = translate(tdt_type, ts_row, &acl_dataset);
|
||||
if (ret != Status::OK()) {
|
||||
DestroyAclDataset(acl_dataset);
|
||||
RETURN_STATUS_UNEXPECTED("Converting into TDT tensor failed!");
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (profiling) {
|
||||
start_time = ProfilingTime::GetCurMilliSecond();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ENABLE_D
|
||||
// Data prefetch only when PS mode enables cache.
|
||||
if (acltdtGetDatasetSize(acl_dataset) > 0) {
|
||||
acltdtDataItem *item0 = acltdtGetDataItem(acl_dataset, 0);
|
||||
std::string item_type;
|
||||
RETURN_IF_NOT_OK(ParseType(acltdtGetDataTypeFromItem(item0), &item_type));
|
||||
if (!ps::PsDataPrefetch::GetInstance().PrefetchData(channel_name_, acltdtGetDataAddrFromItem(item0),
|
||||
acltdtGetDataSizeFromItem(item0), item_type)) {
|
||||
RETURN_STATUS_UNEXPECTED("PrefetchData failed in when pre-processing sending data.");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
auto status = acltdtSendTensor(acl_handle_, acl_dataset, -1);
|
||||
DestroyAclDataset(acl_dataset);
|
||||
if (status != ACL_SUCCESS) {
|
||||
// if the device_queue thread had been interrupted by master, just print warning and return success
|
||||
if (mindspore::dataset::this_thread::is_interrupted()) {
|
||||
MS_LOG(WARNING) << "Device queue thread had been interrupted by TdtHandle::DestroyHandle, you can ignore "
|
||||
<< "the above error: 'failed to send...'. In this scenario, the training ends first without "
|
||||
<< "using all epoch(s) data, and the data preprocessing is blocked by the data "
|
||||
<< "transmission channel on the device side. So we force the data transmission channel to stop.";
|
||||
return Status::OK();
|
||||
}
|
||||
ReportErrorMessage();
|
||||
RETURN_STATUS_UNEXPECTED("Tdt Send data failed.");
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (profiling) {
|
||||
double end_time = ProfilingTime::GetCurMilliSecond();
|
||||
*time = static_cast<int32_t>(end_time - start_time);
|
||||
}
|
||||
#endif
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status TdtPlugin::ParseType(const aclDataType &acl_data_type, std::string *data_type) {
|
||||
auto type_iter = parse_map.find(acl_data_type);
|
||||
if (type_iter == parse_map.end()) {
|
||||
RETURN_STATUS_UNEXPECTED("Got unsupported acl datatype: " + std::to_string(acl_data_type));
|
||||
}
|
||||
*data_type = type_iter->second;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status TdtPlugin::translate(acltdtTensorType tdt_type, const TensorRow &ts_row, acltdtDataset **output_acl_dataset) {
|
||||
auto acl_dataset = acltdtCreateDataset();
|
||||
if (acl_dataset == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED("Create tdt dataset failed.");
|
||||
}
|
||||
auto status = AssembleTensor2AclDataset(tdt_type, ts_row, acl_dataset);
|
||||
if (status != Status::OK()) {
|
||||
DestroyAclDataset(acl_dataset);
|
||||
RETURN_STATUS_UNEXPECTED("Assemble tensor row to tdt dataset failed.");
|
||||
}
|
||||
|
||||
*output_acl_dataset = acl_dataset;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status TdtPlugin::AssembleTensor2AclDataset(acltdtTensorType tdt_type, const TensorRow &ts_row,
|
||||
acltdtDataset *acl_dataset) {
|
||||
if (tdt_type != ACL_TENSOR_DATA_TENSOR || ts_row.size() == 0) {
|
||||
acltdtDataItem *acl_data = acltdtCreateDataItem(tdt_type, nullptr, 0, ACL_BOOL, nullptr, 0);
|
||||
if (acl_data == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED("Create data item failed when send data with type:" + std::to_string(tdt_type));
|
||||
}
|
||||
if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_SUCCESS) {
|
||||
if (acltdtDestroyDataItem(acl_data) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destroy data item failed when send data with type: " << tdt_type;
|
||||
}
|
||||
RETURN_STATUS_UNEXPECTED("Add data item to tdt dataset failed when send data.");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
for (auto ts : ts_row) {
|
||||
aclDataType datatype;
|
||||
acltdtDataItem *acl_data = nullptr;
|
||||
RETURN_IF_NOT_OK(GetAclDataType(ts->type(), &datatype));
|
||||
|
||||
TensorShape tsShape = ts->shape();
|
||||
std::string dataShapes = "[";
|
||||
for (auto dim : tsShape.AsVector()) {
|
||||
(void)dataShapes.append(std::to_string(dim)).append(",");
|
||||
}
|
||||
dataShapes.pop_back();
|
||||
(void)dataShapes.append("]");
|
||||
|
||||
std::shared_ptr<void> dataPtr =
|
||||
std::shared_ptr<void>(reinterpret_cast<uchar *>(&(*ts->begin<uint8_t>())), [](const void *elem) {});
|
||||
size_t dataLen = ts->SizeInBytes();
|
||||
const dsize_t dims = tsShape.Rank();
|
||||
std::vector<int64_t> dataShape;
|
||||
for (auto i = 0; i < dims; i++) {
|
||||
dataShape.emplace_back(tsShape[i]);
|
||||
}
|
||||
acl_data = acltdtCreateDataItem(ACL_TENSOR_DATA_TENSOR, (tsShape.empty() ? nullptr : &dataShape[0]), dims, datatype,
|
||||
dataPtr.get(), dataLen);
|
||||
if (acl_data == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED("Create data item failed when send data.");
|
||||
}
|
||||
if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_SUCCESS) {
|
||||
if (acltdtDestroyDataItem(acl_data) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destroy data item failed when send data with type ACL_TENSOR_DATA_TENSOR.";
|
||||
}
|
||||
RETURN_STATUS_UNEXPECTED("Add data item to tdt dataset failed when send data.");
|
||||
}
|
||||
|
||||
MS_LOG(DEBUG) << "TDT data type is TDT_TENSOR, tensor type is " << datatype << ", tensor shape is " << dataShapes
|
||||
<< ", data length is " << ts->Size() << ".";
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status TdtPlugin::DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item) {
|
||||
if (include_data_item) {
|
||||
for (size_t i = 0; i < acltdtGetDatasetSize(acl_dataset); i++) {
|
||||
if (acltdtDestroyDataItem(acltdtGetDataItem(acl_dataset, i)) != ACL_SUCCESS) {
|
||||
ReportErrorMessage();
|
||||
RETURN_STATUS_UNEXPECTED("Destroy data item failed when send data.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (acltdtDestroyDataset(acl_dataset) != ACL_SUCCESS) {
|
||||
ReportErrorMessage();
|
||||
RETURN_STATUS_UNEXPECTED("Destroy tdt dataset failed when send data.");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -1,71 +0,0 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_PLUGIN_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_PLUGIN_H_
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
#include "acl/acl_tdt.h"
|
||||
#include "runtime/rt_mem_queue.h"
|
||||
#include "runtime/dev.h"
|
||||
#include "runtime/config.h"
|
||||
#include "graph/def_types.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/device_queue_base.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
|
||||
#include "minddata/dataset/core/data_type.h"
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/core/tensor_row.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
class TdtPlugin : public DeviceQueueBase {
|
||||
public:
|
||||
static std::shared_ptr<TdtPlugin> GetInstance();
|
||||
|
||||
Status hostPush(TensorRow ts_row, bool profiling, int32_t *time, acltdtTensorType tdt_type = ACL_TENSOR_DATA_TENSOR);
|
||||
|
||||
TdtPlugin(const std::string &channel_name, int32_t device_id);
|
||||
|
||||
~TdtPlugin();
|
||||
|
||||
private:
|
||||
Status DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item = true);
|
||||
|
||||
Status AssembleTensor2AclDataset(acltdtTensorType tdt_type, const TensorRow &ts_row, acltdtDataset *acl_dataset);
|
||||
|
||||
Status ParseType(const aclDataType &acl_data_type, std::string *data_type);
|
||||
|
||||
Status translate(acltdtTensorType tdt_type, const TensorRow &ts_row, acltdtDataset **output_acl_dataset);
|
||||
|
||||
void *tdt_handle_ = nullptr;
|
||||
|
||||
const std::map<aclDataType, std::string> parse_map = {
|
||||
{ACL_INT8, "int8"}, {ACL_UINT8, "uint8"}, {ACL_INT16, "int16"}, {ACL_UINT16, "uint16"},
|
||||
{ACL_INT32, "int32"}, {ACL_UINT32, "uint32"}, {ACL_INT64, "int64"}, {ACL_UINT64, "uint64"},
|
||||
{ACL_FLOAT16, "float16"}, {ACL_FLOAT, "float32"}, {ACL_DOUBLE, "float64"}, {ACL_BOOL, "bool"}};
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DEVICE_QUEUE_IMPL_TDT_TDT_PLUGIN_H_
|
|
@ -19,7 +19,7 @@
|
|||
#include <limits>
|
||||
#include "minddata/dataset/engine/datasetops/dataset_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/device_queue_op.h"
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
#include "mindspore/core/utils/numa_interface.h"
|
||||
#endif
|
||||
#include "minddata/dataset/util/task_manager.h"
|
||||
|
@ -28,7 +28,7 @@
|
|||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Constructor
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
ExecutionTree::ExecutionTree() : ExecutionTree(GlobalContext::config_manager()) {}
|
||||
|
||||
ExecutionTree::ExecutionTree(std::shared_ptr<ConfigManager> cfg)
|
||||
|
@ -52,7 +52,7 @@ ExecutionTree::ExecutionTree() : id_count_(0), tree_state_(kDeTStateInit), prepa
|
|||
|
||||
// Destructor
|
||||
ExecutionTree::~ExecutionTree() {
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
if (numa_enable_) {
|
||||
handle_ = nullptr;
|
||||
}
|
||||
|
@ -152,7 +152,7 @@ void ExecutionTree::PrintNode(std::ostream &out, const std::shared_ptr<DatasetOp
|
|||
Status ExecutionTree::Launch() {
|
||||
// opencv limit too many threads
|
||||
#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) && !defined(ENABLE_ANDROID)
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
// Here we do numa binding for performance optimization. Our test results show that
|
||||
// binding numa when get_dataset_size launches a tree gives better performance than
|
||||
// binding it only at the time of _To_Device.
|
||||
|
|
|
@ -229,7 +229,7 @@ class ExecutionTree {
|
|||
TreeState tree_state_; // Tracking the current tree state
|
||||
std::string unique_id_; // A unique identifier for the tree
|
||||
|
||||
#if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
|
||||
#ifdef WITH_BACKEND
|
||||
// Constructor used when WITH_BACKEND is defined
|
||||
explicit ExecutionTree(std::shared_ptr<ConfigManager> cfg);
|
||||
// This rank_id is for numa and device_queue, one process work with only one rank_id,
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_GPU_ITEM_CONNECTOR_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_GPU_ITEM_CONNECTOR_H_
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
@ -24,7 +23,7 @@
|
|||
#include "minddata/dataset/engine/connector.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
#include "minddata/dataset/include/dataset/constants.h"
|
||||
#include "runtime/data_queue/blocking_queue.h"
|
||||
#include "include/backend/data_queue/blocking_queue.h"
|
||||
|
||||
using mindspore::device::DataQueueItem;
|
||||
|
||||
|
@ -88,5 +87,4 @@ class GpuConnector : public Connector<GpuConnectorItem> {
|
|||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // ENABLE_GPUQUE
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_GPU_ITEM_CONNECTOR_H_
|
||||
|
|
|
@ -20,17 +20,17 @@
|
|||
#include <algorithm>
|
||||
#include "utils/ms_utils.h"
|
||||
#include "minddata/dataset/util/path.h"
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#include "minddata/dataset/core/config_manager.h"
|
||||
#include "minddata/dataset/core/global_context.h"
|
||||
#endif
|
||||
#include "minddata/dataset/engine/perf/monitor.h"
|
||||
#include "minddata/dataset/engine/perf/connector_size.h"
|
||||
#include "minddata/dataset/engine/perf/cpu_sampler.h"
|
||||
#include "minddata/dataset/engine/execution_tree.h"
|
||||
#include "minddata/dataset/engine/tree_adapter.h"
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
|
||||
#ifdef WITH_BACKEND
|
||||
#include "utils/ms_context.h"
|
||||
#endif
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
|
@ -824,18 +824,19 @@ ProfilingManager::ProfilingRegistrationState ProfilingManager::GetProfilerTreeSt
|
|||
}
|
||||
|
||||
std::string ProfilingManager::GetRankID() const {
|
||||
std::string rank_id;
|
||||
#ifdef ENABLE_GPUQUE
|
||||
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
|
||||
int32_t rank_id_int = cfg->rank_id();
|
||||
// If DEVICE_ID is not set, default value is 0
|
||||
if (rank_id_int < 0) {
|
||||
rank_id = common::GetEnv("DEVICE_ID");
|
||||
} else {
|
||||
rank_id = std::to_string(rank_id_int);
|
||||
std::string rank_id = common::GetEnv("RANK_ID");
|
||||
#ifdef WITH_BACKEND
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
if (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
|
||||
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
|
||||
int32_t rank_id_int = cfg->rank_id();
|
||||
// If DEVICE_ID is not set, default value is 0
|
||||
if (rank_id_int < 0) {
|
||||
rank_id = common::GetEnv("DEVICE_ID");
|
||||
} else {
|
||||
rank_id = std::to_string(rank_id_int);
|
||||
}
|
||||
}
|
||||
#else
|
||||
rank_id = common::GetEnv("RANK_ID");
|
||||
#endif
|
||||
// If RANK_ID is not set, default value is 0
|
||||
if (rank_id.empty()) {
|
||||
|
|
|
@ -18,7 +18,9 @@
|
|||
#include <utility>
|
||||
#include "minddata/dataset/util/log_adapter.h"
|
||||
#include "minddata/dataset/util/system_pool.h"
|
||||
|
||||
#ifdef WITH_BACKEND
|
||||
#include "mindspore/ccsrc/include/backend/data_queue/data_queue_mgr.h"
|
||||
#endif
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
struct MemHdr {
|
||||
|
@ -243,29 +245,22 @@ std::ostream &operator<<(std::ostream &os, const ArenaImpl &s) {
|
|||
Status Arena::Init() {
|
||||
try {
|
||||
int64_t sz = size_in_MB_ * 1048576L;
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#ifdef WITH_BACKEND
|
||||
if (is_cuda_malloc_) {
|
||||
auto ret = cudaHostAlloc(&ptr_, sz, cudaHostAllocDefault);
|
||||
if (ret != cudaSuccess) {
|
||||
MS_LOG(ERROR) << "cudaHostAlloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
|
||||
return Status(StatusCode::kMDOutOfMemory);
|
||||
}
|
||||
impl_ = std::make_unique<ArenaImpl>(ptr_, sz);
|
||||
ptr_ = device::DataQueueMgr::GetInstance().AllocHostMem("GPU", sz);
|
||||
} else {
|
||||
RETURN_IF_NOT_OK(DeMalloc(sz, &ptr_, false));
|
||||
impl_ = std::make_unique<ArenaImpl>(ptr_, sz);
|
||||
ptr_ = device::DataQueueMgr::GetInstance().AllocHostMem("Others", sz);
|
||||
}
|
||||
#else
|
||||
RETURN_IF_NOT_OK(DeMalloc(sz, &ptr_, false));
|
||||
impl_ = std::make_unique<ArenaImpl>(ptr_, sz);
|
||||
ptr_ = std::shared_ptr<void>(::malloc(sz), ::free);
|
||||
#endif
|
||||
impl_ = std::make_unique<ArenaImpl>(ptr_.get(), sz);
|
||||
} catch (std::bad_alloc &e) {
|
||||
return Status(StatusCode::kMDOutOfMemory);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
Arena::Arena(size_t val_in_MB, bool is_cuda_malloc)
|
||||
: ptr_(nullptr), size_in_MB_(val_in_MB), is_cuda_malloc_(is_cuda_malloc) {}
|
||||
|
||||
|
@ -279,19 +274,5 @@ Status Arena::CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB, bool i
|
|||
RETURN_IF_NOT_OK(ba->Init());
|
||||
return Status::OK();
|
||||
}
|
||||
#else
|
||||
Arena::Arena(size_t val_in_MB) : ptr_(nullptr), size_in_MB_(val_in_MB) {}
|
||||
|
||||
Status Arena::CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB) {
|
||||
RETURN_UNEXPECTED_IF_NULL(p_ba);
|
||||
auto ba = new (std::nothrow) Arena(val_in_MB);
|
||||
if (ba == nullptr) {
|
||||
return Status(StatusCode::kMDOutOfMemory);
|
||||
}
|
||||
(*p_ba).reset(ba);
|
||||
RETURN_IF_NOT_OK(ba->Init());
|
||||
return Status::OK();
|
||||
}
|
||||
#endif
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -22,9 +22,6 @@
|
|||
#include "minddata/dataset/util/allocator.h"
|
||||
#include "minddata/dataset/util/memory_pool.h"
|
||||
#include "minddata/dataset/util/treap.h"
|
||||
#ifdef ENABLE_GPUQUE
|
||||
#include <cuda_runtime_api.h>
|
||||
#endif
|
||||
|
||||
#define ARENA_LOG_BLK_SZ (6u)
|
||||
#define ARENA_BLK_SZ (static_cast<uint16_t>(1u << ARENA_LOG_BLK_SZ))
|
||||
|
@ -107,20 +104,7 @@ class Arena : public MemoryPool {
|
|||
// Disable copy and assignment constructor
|
||||
Arena(const Arena &) = delete;
|
||||
Arena &operator=(const Arena &) = delete;
|
||||
~Arena() override {
|
||||
#ifdef ENABLE_GPUQUE
|
||||
if (is_cuda_malloc_) {
|
||||
if (ptr_ != nullptr) {
|
||||
(void)cudaFreeHost(ptr_);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (ptr_ != nullptr) {
|
||||
free(ptr_);
|
||||
}
|
||||
ptr_ = nullptr;
|
||||
#endif
|
||||
}
|
||||
~Arena() override = default;
|
||||
|
||||
/// As a derived class of MemoryPool, we have to implement the following.
|
||||
/// But we simply transfer the call to the implementation class
|
||||
|
@ -150,28 +134,17 @@ class Arena : public MemoryPool {
|
|||
os << *(s.impl_);
|
||||
return os;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
/// The only method to create an arena.
|
||||
static Status CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB = 4096, bool is_cuda_malloc = false);
|
||||
#else
|
||||
/// The only method to create an arena.
|
||||
static Status CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB = 4096);
|
||||
#endif
|
||||
|
||||
protected:
|
||||
mutable std::mutex mux_;
|
||||
std::unique_ptr<ArenaImpl> impl_;
|
||||
void *ptr_;
|
||||
std::shared_ptr<void> ptr_;
|
||||
size_t size_in_MB_;
|
||||
#ifdef ENABLE_GPUQUE
|
||||
|
||||
bool is_cuda_malloc_;
|
||||
|
||||
explicit Arena(size_t val_in_MB = 4096, bool is_cuda_malloc = false);
|
||||
#else
|
||||
|
||||
explicit Arena(size_t val_in_MB = 4096);
|
||||
#endif
|
||||
|
||||
Status Init();
|
||||
};
|
||||
|
|
|
@ -27,11 +27,7 @@ namespace dataset {
|
|||
Status CircularPool::AddOneArena() {
|
||||
Status rc;
|
||||
std::shared_ptr<Arena> b;
|
||||
#ifdef ENABLE_GPUQUE
|
||||
RETURN_IF_NOT_OK(Arena::CreateArena(&b, arena_size_, is_cuda_malloc_));
|
||||
#else
|
||||
RETURN_IF_NOT_OK(Arena::CreateArena(&b, arena_size_));
|
||||
#endif
|
||||
tail_ = b.get();
|
||||
cur_size_in_mb_ += arena_size_;
|
||||
mem_segments_.push_back(std::move(b));
|
||||
|
@ -198,22 +194,13 @@ int CircularPool::PercentFree() const {
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
CircularPool::CircularPool(int max_size_in_gb, int arena_size, bool is_cuda_malloc)
|
||||
: unlimited_(max_size_in_gb <= 0),
|
||||
max_size_in_mb_(unlimited_ ? std::numeric_limits<int32_t>::max() : max_size_in_gb * 1024),
|
||||
arena_size_(arena_size),
|
||||
is_cuda_malloc_(is_cuda_malloc),
|
||||
cur_size_in_mb_(0) {}
|
||||
#else
|
||||
CircularPool::CircularPool(int max_size_in_gb, int arena_size)
|
||||
: unlimited_(max_size_in_gb <= 0),
|
||||
max_size_in_mb_(unlimited_ ? std::numeric_limits<int32_t>::max() : max_size_in_gb * 1024),
|
||||
arena_size_(arena_size),
|
||||
cur_size_in_mb_(0) {}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb, int arena_size,
|
||||
bool createOneArena, bool is_cuda_malloc) {
|
||||
Status rc;
|
||||
|
@ -234,28 +221,6 @@ Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, i
|
|||
}
|
||||
return rc;
|
||||
}
|
||||
#else
|
||||
Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb, int arena_size,
|
||||
bool createOneArena) {
|
||||
Status rc;
|
||||
if (out_pool == nullptr) {
|
||||
RETURN_STATUS_UNEXPECTED("pPool is null");
|
||||
}
|
||||
auto pool = new (std::nothrow) CircularPool(max_size_in_gb, arena_size);
|
||||
if (pool == nullptr) {
|
||||
RETURN_STATUS_OOM("Out of memory.");
|
||||
}
|
||||
if (createOneArena) {
|
||||
rc = pool->AddOneArena();
|
||||
}
|
||||
if (rc.IsOk()) {
|
||||
(*out_pool).reset(pool);
|
||||
} else {
|
||||
delete pool;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
|
||||
CircularPool::~CircularPool() = default;
|
||||
} // namespace dataset
|
||||
|
|
|
@ -85,13 +85,8 @@ class CircularPool : public MemoryPool {
|
|||
return os;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_GPUQUE
|
||||
static Status CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb = -1,
|
||||
int arena_size = 4096, bool create_one_arena = false, bool is_cuda_malloc = false);
|
||||
#else
|
||||
static Status CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb = -1,
|
||||
int arena_size = 4096, bool create_one_arena = false);
|
||||
#endif
|
||||
|
||||
private:
|
||||
ListOfArenas mem_segments_;
|
||||
|
@ -101,16 +96,10 @@ class CircularPool : public MemoryPool {
|
|||
int arena_size_;
|
||||
int cur_size_in_mb_;
|
||||
RWLock rw_lock_;
|
||||
#ifdef ENABLE_GPU
|
||||
bool is_cuda_malloc_;
|
||||
|
||||
// We can take negative or 0 as input which means unlimited.
|
||||
CircularPool(int max_size_in_gb, int arena_size, bool is_cuda_malloc);
|
||||
#else
|
||||
|
||||
// We can take negative or 0 as input which means unlimited.
|
||||
CircularPool(int max_size_in_gb, int arena_size);
|
||||
#endif
|
||||
|
||||
Status AddOneArena();
|
||||
};
|
||||
|
|
|
@ -22,12 +22,10 @@
|
|||
#if defined(__ANDROID__) || defined(ANDROID)
|
||||
#include "minddata/dataset/util/services.h"
|
||||
#endif
|
||||
#ifdef ENABLE_TDTQUE
|
||||
#include "acl/acl_tdt.h"
|
||||
#include "tdt/status.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
#ifdef WITH_BACKEND
|
||||
#include "utils/ms_context.h"
|
||||
#include "mindspore/ccsrc/include/backend/data_queue/data_queue_mgr.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
thread_local Task *gMyTask = nullptr;
|
||||
|
@ -141,6 +139,10 @@ Status Task::Run() {
|
|||
}
|
||||
|
||||
Status Task::Join(WaitFlag blocking) {
|
||||
#ifdef WITH_BACKEND
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
std::string device_target = MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
#endif
|
||||
if (running_) {
|
||||
RETURN_UNEXPECTED_IF_NULL(MyTaskGroup());
|
||||
auto interrupt_svc = MyTaskGroup()->GetIntrpService();
|
||||
|
@ -162,23 +164,27 @@ Status Task::Join(WaitFlag blocking) {
|
|||
MS_LOG(WARNING) << MyName() << " Thread ID " << ss.str() << " is not responding. Interrupt again";
|
||||
interrupt_svc->InterruptAll();
|
||||
wait_times++;
|
||||
#ifdef ENABLE_TDTQUE
|
||||
// Because hostPush hung in DeviceQueueOp, wait 5 seconds and destroy the tdt
|
||||
if (wait_times > 5 && my_name_.find("DeviceQueueOp") != std::string::npos) {
|
||||
MS_LOG(WARNING) << "Wait " << wait_times << " seconds, "
|
||||
<< "the task: " << my_name_ << " will be destroyed by TdtHostDestory.";
|
||||
if (!TdtHandle::DestroyHandle()) {
|
||||
MS_LOG(WARNING) << "Destroy tdt channel failed.";
|
||||
} else {
|
||||
MS_LOG(INFO) << "Destroy tdt channel success.";
|
||||
}
|
||||
#ifdef WITH_BACKEND
|
||||
if (device_target == kAscendDevice) {
|
||||
// Because hostPush may hang in DeviceQueueOp, wait 5 seconds and then destroy the tdt channel
|
||||
if (wait_times > 5 && my_name_.find("DeviceQueueOp") != std::string::npos) {
|
||||
MS_LOG(WARNING) << "Wait " << wait_times << " seconds, "
|
||||
<< "the task: " << my_name_ << " will be destroyed by TdtHostDestory.";
|
||||
auto queue =
|
||||
device::DataQueueMgr::GetInstance().CreateDataQueue(kAscendDevice, {}, false, 0, nullptr, {});
|
||||
if (queue != nullptr && !queue->Destroy()) {
|
||||
MS_LOG(WARNING) << "Destroy tdt channel failed.";
|
||||
} else {
|
||||
MS_LOG(INFO) << "Destroy tdt channel success.";
|
||||
}
|
||||
|
||||
// just wait 30 seconds
|
||||
// case1: cpu usage 100%, DeviceQueueOp thread may destroy without thrd_ future
|
||||
if (wait_times > kWaitInterruptTaskTime) {
|
||||
MS_LOG(WARNING) << MyName() << " Thread ID " << ss.str()
|
||||
<< " is not responding. Maybe it's destroyed, task stop.";
|
||||
break;
|
||||
// just wait 30 seconds
|
||||
// case1: cpu usage 100%, DeviceQueueOp thread may destroy without thrd_ future
|
||||
if (wait_times > kWaitInterruptTaskTime) {
|
||||
MS_LOG(WARNING) << MyName() << " Thread ID " << ss.str()
|
||||
<< " is not responding. Maybe it's destroyed, task stop.";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1109,7 +1109,7 @@ bool TaskEmitAction(const ResourcePtr &resource) {
|
|||
resource->SetResult(kOutput, graph_id);
|
||||
return true;
|
||||
}
|
||||
std::vector<PrimitivePtr> cut_list = compile::nonlinear_ops;
|
||||
std::vector<PrimitivePtr> cut_list = compile::GetNonlinearOps();
|
||||
if (bc_ptr->name() == kMsConvert) {
|
||||
cut_list = compile::GetMsNonlinearOps();
|
||||
}
|
||||
|
|
|
@ -7,11 +7,6 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
|
|||
list(REMOVE_ITEM DEVICE_SRC_LIST "distribute/mpi_collective_group.cc"
|
||||
"distribute/collective_group_wrapper.cc" "distribute/mpi_pycc.cc")
|
||||
|
||||
if(ENABLE_TDTQUE)
|
||||
file(GLOB_RECURSE TDT_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
${CMAKE_SOURCE_DIR}/mindspore/ccsrc/minddata/dataset/engine/tdt/tdt_handle.cc)
|
||||
endif()
|
||||
|
||||
if(ENABLE_MPI AND ENABLE_D)
|
||||
set_property(SOURCE "distribute/mpi_pycc.cc"
|
||||
PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
|
||||
|
|
|
@ -16,16 +16,113 @@
|
|||
|
||||
#include "plugin/device/ascend/hal/device/ascend_data_queue.h"
|
||||
#include <string>
|
||||
#include "plugin/device/ascend/hal/device/ascend_kernel_runtime.h"
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include "graph/def_types.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "utils/log_adapter.h"
|
||||
#include "plugin/device/ascend/hal/device/ascend_kernel_runtime.h"
|
||||
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace {
|
||||
const std::map<aclDataType, std::string> kAclTypeToString = {
|
||||
{ACL_INT8, "int8"}, {ACL_UINT8, "uint8"}, {ACL_INT16, "int16"}, {ACL_UINT16, "uint16"},
|
||||
{ACL_INT32, "int32"}, {ACL_UINT32, "uint32"}, {ACL_INT64, "int64"}, {ACL_UINT64, "uint64"},
|
||||
{ACL_FLOAT16, "float16"}, {ACL_FLOAT, "float32"}, {ACL_DOUBLE, "float64"}, {ACL_BOOL, "bool"}};
|
||||
|
||||
const std::map<std::string, aclDataType> kStringTypeToAclType = []() -> std::map<std::string, aclDataType> {
|
||||
std::map<std::string, aclDataType> ret;
|
||||
for (const auto &[acl_type, type_str] : kAclTypeToString) {
|
||||
ret.emplace(type_str, acl_type);
|
||||
}
|
||||
return ret;
|
||||
}();
|
||||
|
||||
constexpr auto kUnknownErrorString = "Unknown error occurred";
|
||||
std::map<void **, std::thread *> g_acl_handle_map = {};
|
||||
|
||||
constexpr auto kMbufHeadEndOfSequencePos = 128U;
|
||||
constexpr auto kEndOfSequenceFlag = 0x5A;
|
||||
constexpr auto kTransIdOffset = 64UL;
|
||||
constexpr auto kSleepMilliSeconds = 500;
|
||||
|
||||
std::atomic<bool> g_acl_destroy_all = false;
|
||||
|
||||
bool GetAclDataType(const std::string &str_type, aclDataType *acl_type) {
|
||||
MS_EXCEPTION_IF_NULL(acl_type);
|
||||
auto iter = kStringTypeToAclType.find(str_type);
|
||||
if (iter == kStringTypeToAclType.end()) {
|
||||
MS_LOG(EXCEPTION) << "Invalid type " << str_type;
|
||||
}
|
||||
*acl_type = iter->second;
|
||||
return true;
|
||||
}
|
||||
|
||||
void CheckRtRetWithError(rtError_t error, const std::string &msg) {
|
||||
if (error != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Rt error: " << msg << " | Error number: " << error;
|
||||
}
|
||||
}
|
||||
|
||||
void ReportErrorMessage() {
|
||||
const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
if (!error_message.empty() && error_message.find(kUnknownErrorString) == std::string::npos) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace tdt_handle {
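// Process-wide registry of open ACL TDT channel handles (g_acl_handle_map).
// DestroyHandle() stops and destroys every registered channel, joins the worker thread
// attached to it, and sets g_acl_destroy_all so IsClosed() reports that the channels
// were torn down deliberately (e.g. training finished before the pipeline drained).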
|
||||
void AddHandle(acltdtChannelHandle **handle, std::thread *use_thread) {
|
||||
if (*handle != nullptr) {
|
||||
auto ret = g_acl_handle_map.emplace(reinterpret_cast<void **>(handle), use_thread);
|
||||
if (!std::get<1>(ret)) {
|
||||
MS_LOG(ERROR) << "Failed to add new handle to acl_handle_map." << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DelHandle(acltdtChannelHandle **handle) {
|
||||
void **void_handle = reinterpret_cast<void **>(handle);
|
||||
g_acl_handle_map.erase(void_handle);
|
||||
}
|
||||
|
||||
bool DestroyHandle() {
|
||||
bool destroy_all = true;
|
||||
for (auto &item : g_acl_handle_map) {
|
||||
acltdtChannelHandle **handle = reinterpret_cast<acltdtChannelHandle **>(item.first);
|
||||
if (*handle != nullptr) {
|
||||
aclError stop_status = acltdtStopChannel(*handle);
|
||||
if (stop_status != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed stop acl data channel and the stop status is " << stop_status << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (item.second != nullptr && item.second->joinable()) {
|
||||
item.second->join();
|
||||
}
|
||||
if (acltdtDestroyChannel(*handle) != ACL_SUCCESS) {
|
||||
MS_LOG(INFO) << "acltdtDestroyChannel failed.";
|
||||
destroy_all = false;
|
||||
} else {
|
||||
*handle = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// clear the map container when all the handles have been destroyed
|
||||
if (destroy_all) {
|
||||
g_acl_handle_map.clear();
|
||||
g_acl_destroy_all = true;
|
||||
}
|
||||
return destroy_all;
|
||||
}
|
||||
|
||||
bool IsClosed() { return g_acl_destroy_all; }
|
||||
} // namespace tdt_handle
|
||||
|
||||
AscendDataQueueDynamic::AscendDataQueueDynamic(const size_t capacity)
|
||||
: DataQueue(capacity), stream_(nullptr), node_info_(nullptr) {
|
||||
auto context_key = device_context_->device_context_key();
|
||||
|
@ -35,12 +132,12 @@ AscendDataQueueDynamic::AscendDataQueueDynamic(const size_t capacity)
|
|||
stream_ = runtime_instance->compute_stream();
|
||||
}
|
||||
|
||||
BlockQueueStatus_T AscendDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
||||
DataQueueStatus AscendDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
||||
for (size_t i = 0; i < data.size(); i++) {
|
||||
auto &item = data[i];
|
||||
if (item.data_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Invalid Input: ptr: " << item.data_ptr_ << ", len: " << item.data_len_;
|
||||
return ERROR_INPUT;
|
||||
return DataQueueStatus::ERROR_INPUT;
|
||||
}
|
||||
void *addr = device_context_->device_res_manager_->AllocateMemory(item.data_len_);
|
||||
if (addr == nullptr) {
|
||||
|
@ -52,26 +149,480 @@ BlockQueueStatus_T AscendDataQueueDynamic::Push(std::vector<DataQueueItem> data)
|
|||
item.device_addr_ = addr;
|
||||
}
|
||||
CheckRtRetWithError(rtStreamSynchronize(stream_), "Call runtime rtStreamSynchronize failed");
|
||||
node_info_[tail_].data_ = data;
|
||||
node_info_[tail_].data_ = std::move(data);
|
||||
tail_ = (tail_ + 1) % (capacity_);
|
||||
++size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T AscendDataQueueDynamic::Front(std::vector<DataQueueItem> *data) const {
|
||||
DataQueueStatus AscendDataQueueDynamic::Front(std::vector<DataQueueItem> *data) const {
|
||||
for (auto &item : node_info_[head_].data_) {
|
||||
host_release_(item.data_ptr_, item.worker_id_);
|
||||
}
|
||||
*data = node_info_[head_].data_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T AscendDataQueueDynamic::Pop() {
|
||||
DataQueueStatus AscendDataQueueDynamic::Pop() {
|
||||
head_ = (head_ + 1) % (capacity_);
|
||||
--size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
bool AscendDataQueueDynamic::Destroy() { return true; }
|
||||
|
||||
AscendTdtQueue::AscendTdtQueue(const std::string &channel_name) : DataQueue(0), channel_name_(channel_name) {
|
||||
// init ErrorManager, 0 means success
|
||||
if (ErrorManager::GetInstance().Init() != 0) {
|
||||
MS_LOG(WARNING) << "[Internal Error] Init ErrorManager failed.";
|
||||
}
|
||||
// get device id
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
device_id_ = MsContext::GetInstance()->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
|
||||
// create acl tdt handle
|
||||
if (!channel_name.empty()) {
|
||||
acl_handle_ = acltdtCreateChannel(device_id_, channel_name.c_str());
|
||||
if (acl_handle_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Create channel for sending data failed, please check DEVICE ID setting, DEVICE ID that passed "
|
||||
"into dataset(from context) and training process should be the same.";
|
||||
ReportErrorMessage();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
AscendTdtQueue::~AscendTdtQueue() {
|
||||
if (acl_handle_ != nullptr) {
|
||||
if (acltdtDestroyChannel(acl_handle_) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed to destroy channel for tdt queue.";
|
||||
ReportErrorMessage();
|
||||
} else {
|
||||
tdt_handle::DelHandle(&acl_handle_);
|
||||
acl_handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool AscendTdtQueue::Destroy() { return tdt_handle::DestroyHandle(); }
|
||||
|
||||
bool AscendTdtQueue::IsOpen() const { return !tdt_handle::IsClosed(); }
|
||||
|
||||
DataQueueStatus AscendTdtQueue::Push(std::vector<DataQueueItem> data) {
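// Convert the row into an acltdtDataset, hand the first item to the PS data prefetcher,
// then block in acltdtSendTensor (timeout -1). A send failure after the channel was
// closed by tdt_handle::DestroyHandle() is reported as SUCCESS, since in that case
// training has already finished and the channel was shut down on purpose.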
|
||||
MS_LOG(DEBUG) << "TDT channel name is " << channel_name_ << ".";
|
||||
acltdtDataset *acl_dataset = nullptr;
|
||||
auto ret = Translate(data, &acl_dataset);
|
||||
if (!ret) {
|
||||
DestroyAclDataset(acl_dataset);
|
||||
MS_LOG(ERROR) << "Converting into TDT tensor failed!";
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
|
||||
// Data prefetch only when PS mode enables cache.
|
||||
if (acltdtGetDatasetSize(acl_dataset) > 0) {
|
||||
acltdtDataItem *item0 = acltdtGetDataItem(acl_dataset, 0);
|
||||
std::string item_type;
|
||||
ParseType(acltdtGetDataTypeFromItem(item0), &item_type);
|
||||
if (!ps::PsDataPrefetch::GetInstance().PrefetchData(channel_name_, acltdtGetDataAddrFromItem(item0),
|
||||
acltdtGetDataSizeFromItem(item0), item_type)) {
|
||||
MS_LOG(ERROR) << "PrefetchData failed in when pre-processing sending data.";
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
}
|
||||
auto status = acltdtSendTensor(acl_handle_, acl_dataset, -1);
|
||||
DestroyAclDataset(acl_dataset);
|
||||
if (status != ACL_SUCCESS) {
|
||||
// if the device_queue thread had been interrupted by master, just print warning and return success
|
||||
if (tdt_handle::IsClosed()) {
|
||||
MS_LOG(WARNING) << "Device queue thread had been interrupted by TdtHandle::DestroyHandle, you can ignore "
|
||||
<< "the above error: 'failed to send...'. In this scenario, the training ends first without "
|
||||
<< "using all epoch(s) data, and the data preprocessing is blocked by the data "
|
||||
<< "transmission channel on the device side. So we force the data transmission channel to stop.";
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
ReportErrorMessage();
|
||||
MS_LOG(ERROR) << "Tdt Send data failed.";
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
void AscendTdtQueue::ParseType(aclDataType acl_data_type, std::string *data_type) {
|
||||
auto type_iter = kAclTypeToString.find(acl_data_type);
|
||||
if (type_iter == kAclTypeToString.end()) {
|
||||
MS_LOG(EXCEPTION) << "Got unsupported acl datatype: " << acl_data_type;
|
||||
}
|
||||
*data_type = type_iter->second;
|
||||
}
|
||||
|
||||
bool AscendTdtQueue::Translate(const std::vector<DataQueueItem> &data, acltdtDataset **output_acl_dataset) {
|
||||
auto acl_dataset = acltdtCreateDataset();
|
||||
if (acl_dataset == nullptr) {
|
||||
MS_LOG(ERROR) << "Create tdt dataset failed.";
|
||||
return false;
|
||||
}
|
||||
bool status = AssembleTensor2AclDataset(data, acl_dataset);
|
||||
if (!status) {
|
||||
DestroyAclDataset(acl_dataset);
|
||||
MS_LOG(ERROR) << "Assemble tensor row to tdt dataset failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
*output_acl_dataset = acl_dataset;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendTdtQueue::AssembleTensor2AclDataset(const std::vector<DataQueueItem> &data, acltdtDataset *acl_dataset) {
|
||||
if (data.empty()) {
|
||||
acltdtDataItem *acl_data =
|
||||
acltdtCreateDataItem(acltdtTensorType::ACL_TENSOR_DATA_END_OF_SEQUENCE, nullptr, 0, ACL_BOOL, nullptr, 0);
|
||||
if (acl_data == nullptr) {
|
||||
MS_LOG(ERROR) << "Create data item failed when send empty data.";
|
||||
return false;
|
||||
}
|
||||
if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_SUCCESS) {
|
||||
if (acltdtDestroyDataItem(acl_data) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destroy data item failed when send empty data.";
|
||||
}
|
||||
MS_LOG(ERROR) << "Add data item to tdt dataset failed when send data.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
for (const auto &ts : data) {
|
||||
aclDataType acl_type;
|
||||
acltdtDataItem *acl_data = nullptr;
|
||||
if (!GetAclDataType(ts.data_type_, &acl_type)) {
|
||||
MS_LOG(ERROR) << "Convert type " << ts.data_type_ << " to acl type failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto &shape = ts.shapes_;
|
||||
std::string shape_str = "[";
|
||||
for (auto dim : shape) {
|
||||
(void)shape_str.append(std::to_string(dim)).append(",");
|
||||
}
|
||||
shape_str.pop_back();
|
||||
(void)shape_str.append("]");
|
||||
|
||||
void *data_ptr = ts.data_ptr_;
|
||||
size_t data_size = ts.data_len_;
|
||||
|
||||
acl_data = acltdtCreateDataItem(acltdtTensorType::ACL_TENSOR_DATA_TENSOR, (shape.empty() ? nullptr : &shape[0]),
|
||||
shape.size(), acl_type, data_ptr, data_size);
|
||||
if (acl_data == nullptr) {
|
||||
MS_LOG(ERROR) << "Create data item failed when send data.";
|
||||
return false;
|
||||
}
|
||||
if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_SUCCESS) {
|
||||
if (acltdtDestroyDataItem(acl_data) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destroy data item failed when send data with type ACL_TENSOR_DATA_TENSOR.";
|
||||
}
|
||||
MS_LOG(INFO) << "Add data item to tdt dataset failed when send data.";
|
||||
return false;
|
||||
}
|
||||
|
||||
MS_LOG(DEBUG) << "TDT data type is TDT_TENSOR, tensor type is " << acl_type << ", tensor shape is " << shape_str
|
||||
<< ", data length is " << data_size << ".";
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void AscendTdtQueue::DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item) {
|
||||
if (include_data_item) {
|
||||
for (size_t i = 0; i < acltdtGetDatasetSize(acl_dataset); i++) {
|
||||
if (acltdtDestroyDataItem(acltdtGetDataItem(acl_dataset, i)) != ACL_SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "Destroy data item failed when send data.";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (acltdtDestroyDataset(acl_dataset) != ACL_SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "Destroy tdt dataset failed when send data.";
|
||||
}
|
||||
}
|
||||
|
||||
AscendHostQueue::AscendHostQueue(const std::string &channel_name)
|
||||
: DataQueue(0), channel_name_(channel_name), queue_id_to_trans_id_map_(), queue_id_(0) {
|
||||
// init ErrorManager, 0 means success
|
||||
if (ErrorManager::GetInstance().Init() != 0) {
|
||||
MS_LOG(WARNING) << "[Internal Error] Init ErrorManager failed.";
|
||||
}
|
||||
// get device id
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
device_id_ = MsContext::GetInstance()->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
|
||||
if (!HostQueueInit()) {
|
||||
MS_LOG(WARNING) << "Host queue init failed.";
|
||||
}
|
||||
}
|
||||
|
||||
DataQueueStatus AscendHostQueue::Push(std::vector<DataQueueItem> data) {
|
||||
if (!SendDataByHostQueue(data)) {
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::HostQueueInit() {
|
||||
auto ret = rtSetDevice(device_id_);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtSetDevice failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
ret = rtMemQueueInit(device_id_);
|
||||
if (ret != ACL_RT_SUCCESS && ret != ACL_ERROR_RT_REPEATED_INIT) {
|
||||
MS_LOG(ERROR) << "call rtMemQueueInit failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
rtMemQueueAttr_t attr = {};
|
||||
auto mem_ret = memset_s(attr.name, RT_MQ_MAX_NAME_LEN, 0, RT_MQ_MAX_NAME_LEN);
|
||||
if (mem_ret != EOK) {
|
||||
MS_LOG(ERROR) << "call memset_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
mem_ret = memcpy_s(attr.name, RT_MQ_MAX_NAME_LEN, channel_name_.c_str(), channel_name_.size() + 1);
|
||||
if (mem_ret != EOK) {
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
attr.depth = 128U;
|
||||
attr.workMode = RT_MQ_MODE_DEFAULT;
|
||||
attr.flowCtrlFlag = false;
|
||||
attr.flowCtrlDropTime = 0U;
|
||||
attr.overWriteFlag = false;
|
||||
ret = rtMemQueueCreate(device_id_, &attr, &queue_id_);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMemQueueCreate failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
rtMemBuffCfg_t buff_cfg = {};
|
||||
ret = rtMbufInit(&buff_cfg);
|
||||
if (ret != ACL_RT_SUCCESS && ret != ACL_ERROR_RT_REPEATED_INIT) {
|
||||
MS_LOG(ERROR) << "call rtMbufInit failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
const std::lock_guard<std::mutex> lk(queue_id_to_trans_id_map_mutex_);
|
||||
(void)queue_id_to_trans_id_map_.emplace(queue_id_, 0UL);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::SendDataByHostQueue(const std::vector<DataQueueItem> &data) {
|
||||
bool status;
|
||||
bool is_need_resend = false;
|
||||
void *buff = nullptr;
|
||||
// Status status;
|
||||
if (!LaunchTensor2MBuff(data, &buff)) {
|
||||
return false;
|
||||
}
|
||||
do {
|
||||
if (is_need_resend) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kSleepMilliSeconds));
|
||||
}
|
||||
status = EnqueueData(buff, &is_need_resend);
|
||||
} while (status && is_need_resend);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::SetTransId4MBuf(void **buff) {
|
||||
void *head_buff = nullptr;
|
||||
uint64_t head_size = 0UL;
|
||||
auto ret = rtMbufGetPrivInfo(*buff, &head_buff, &head_size);
|
||||
if (ret != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufGetPrivInfo failed, ret =" << ret;
|
||||
return false;
|
||||
}
|
||||
uint64_t *trans_id = reinterpret_cast<uint64_t *>(static_cast<uint8_t *>(head_buff) + head_size - kTransIdOffset);
|
||||
const std::lock_guard<std::mutex> lk(queue_id_to_trans_id_map_mutex_);
|
||||
*trans_id = ++queue_id_to_trans_id_map_[queue_id_];
|
||||
MS_LOG(DEBUG) << "host queue[" << queue_id_ << "] set trans id[" << *trans_id << "] success";
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::LaunchTensor2MBuff(const std::vector<DataQueueItem> &data, void **buff) {
|
||||
std::vector<DataItemInfo> items;
|
||||
if (!CreateDataItemInfos(data, &items)) {
|
||||
return false;
|
||||
}
|
||||
if (!SerializeDataItemInfos(&items, buff)) {
|
||||
return false;
|
||||
}
|
||||
if (!SetTransId4MBuf(buff)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::EnqueueData(void *buff, bool *need_resend) {
|
||||
MS_EXCEPTION_IF_NULL(need_resend);
|
||||
*need_resend = false;
|
||||
auto rt_error = rtSetDevice(device_id_);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtSetDevice device failed, ret=" << rt_error;
|
||||
return false;
|
||||
}
|
||||
rt_error = rtMemQueueEnQueue(device_id_, queue_id_, buff);
|
||||
if (rt_error == RT_ERROR_NONE) {
|
||||
return true;
|
||||
} else if (rt_error == ACL_ERROR_RT_QUEUE_FULL) {
|
||||
*need_resend = true;
|
||||
MS_LOG(DEBUG) << "queue[" << queue_id_ << "] is full, need call rtMemQueueEnQueue again";
|
||||
} else {
|
||||
HostQueueFreeBuff(buff);
|
||||
MS_LOG(ERROR) << "host enqueue queue[" << queue_id_ << "] failed, ret = " << rt_error;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::CreateDataItemInfos(const std::vector<DataQueueItem> &data, std::vector<DataItemInfo> *items) {
|
||||
MS_EXCEPTION_IF_NULL(items);
|
||||
if (data.empty()) {
|
||||
items->emplace_back(BuildDataItemInfo(ACL_TENSOR_DATA_END_OF_SEQUENCE, ACL_BOOL, nullptr, 0UL, nullptr, 0UL));
|
||||
return true;
|
||||
}
|
||||
|
||||
for (auto &ts : data) {
|
||||
aclDataType acl_type;
|
||||
if (!GetAclDataType(ts.data_type_, &acl_type)) {
|
||||
MS_LOG(ERROR) << "Convert type " << ts.data_type_ << " to acl type failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto &shape = ts.shapes_;
|
||||
void *data_ptr = ts.data_ptr_;
|
||||
size_t data_size = ts.data_len_;
|
||||
|
||||
if (ts.data_type_ != "string") {
|
||||
items->emplace_back(BuildDataItemInfo(ACL_TENSOR_DATA_TENSOR, static_cast<int32_t>(acl_type),
|
||||
(shape.empty() ? nullptr : &shape[0]), shape.size(), data_ptr, data_size));
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Create data item failed when send data with type:" << ts.data_type_;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendHostQueue::SerializeDataItemInfos(std::vector<DataItemInfo> *items, void **buff) {
|
||||
size_t cnt = items->size();
|
||||
size_t total_size = 0UL;
|
||||
for (size_t i = 0UL; i < cnt; ++i) {
|
||||
(*items)[i].ctrl_info.cur_cnt = i;
|
||||
(*items)[i].ctrl_info.cnt = cnt;
|
||||
total_size +=
|
||||
sizeof(DataItemInfo::ItemInfo) + (*items)[i].ctrl_info.dim_num * sizeof(int64_t) + (*items)[i].ctrl_info.data_len;
|
||||
}
|
||||
|
||||
auto rt_error = rtMbufAlloc(buff, total_size);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufAlloc with size[" << total_size << "] failed, ret = " << rt_error;
|
||||
return false;
|
||||
}
|
||||
|
||||
void *data = nullptr;
|
||||
rt_error = rtMbufGetBuffAddr(*buff, &data);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call rtMbufGetBuffAddr with size[" << total_size << "] failed, ret = " << rt_error;
|
||||
return false;
|
||||
}
|
||||
|
||||
void *head_buf = nullptr;
|
||||
uint64_t head_size = 0UL;
|
||||
rt_error = rtMbufGetPrivInfo(*buff, &head_buf, &head_size);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call rtMbufGetPrivInfo failed, ret =" << rt_error;
|
||||
return false;
|
||||
}
|
||||
if ((head_buf != nullptr) && (head_size > kMbufHeadEndOfSequencePos)) {
|
||||
MS_LOG(DEBUG) << "host queue set end_of_sequence mbuf head.";
|
||||
}
|
||||
|
||||
size_t offset = 0UL;
|
||||
for (size_t i = 0UL; i < cnt; ++i) {
|
||||
auto mem_ret = memcpy_s(::ge::ValueToPtr(::ge::PtrToValue(data) + offset), sizeof(DataItemInfo::ItemInfo),
|
||||
&((*items)[i].ctrl_info), sizeof(DataItemInfo::ItemInfo));
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
offset += sizeof(DataItemInfo::ItemInfo);
|
||||
|
||||
for (size_t j = 0UL; j < (*items)[i].ctrl_info.dim_num; ++j) {
|
||||
mem_ret = memcpy_s(::ge::ValueToPtr(::ge::PtrToValue(data) + offset), sizeof(int64_t), &((*items)[i].dims[j]),
|
||||
sizeof(int64_t));
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
offset += sizeof(int64_t);
|
||||
}
|
||||
|
||||
if ((*items)[i].ctrl_info.data_len == 0UL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
mem_ret = memcpy_s(::ge::ValueToPtr(::ge::PtrToValue(data) + offset), (*items)[i].ctrl_info.data_len,
|
||||
(*items)[i].data_ptr, (*items)[i].ctrl_info.data_len);
|
||||
if (mem_ret != EOK) {
|
||||
(void)rtMbufFree(*buff);
|
||||
MS_LOG(ERROR) << "call memcpy_s failed, ret =" << mem_ret;
|
||||
return false;
|
||||
}
|
||||
offset += (*items)[i].ctrl_info.data_len;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
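The loop above implies a fixed per-item wire layout: one ItemInfo control header, then dim_num int64_t dimensions, then data_len payload bytes. A minimal sketch of that size computation (the helper name is illustrative and not part of this commit):

// Sketch only: mirrors the offsets used by SerializeDataItemInfos().
size_t SerializedSizeOfItem(const AscendHostQueue::DataItemInfo &item) {
  size_t size = sizeof(AscendHostQueue::DataItemInfo::ItemInfo);  // fixed control header
  size += item.ctrl_info.dim_num * sizeof(int64_t);               // shape dimensions
  size += static_cast<size_t>(item.ctrl_info.data_len);           // tensor payload
  return size;
}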
|
||||
|
||||
AscendHostQueue::DataItemInfo AscendHostQueue::BuildDataItemInfo(acltdtTensorType acl_data_type, int32_t tensor_type,
|
||||
const int64_t *dims, size_t dim_size, void *data_ptr,
|
||||
uint64_t data_len) {
|
||||
DataItemInfo item = {};
|
||||
item.ctrl_info.data_type = static_cast<int32_t>(acl_data_type);
|
||||
item.ctrl_info.tensor_type = tensor_type;
|
||||
item.ctrl_info.dim_num = dim_size;
|
||||
item.ctrl_info.data_len = data_len;
|
||||
item.dims = std::vector<int64_t>(dims, dims + dim_size);
|
||||
item.data_ptr = data_ptr;
|
||||
return item;
|
||||
}
|
||||
|
||||
void AscendHostQueue::HostQueueFreeBuff(void *buff) {
|
||||
auto rt_error = rtMbufFree(buff);
|
||||
if (rt_error != ACL_RT_SUCCESS) {
|
||||
MS_LOG(ERROR) << "call rtMbufFree failed, ret=" << rt_error;
|
||||
}
|
||||
}

namespace {
std::shared_ptr<DataQueue> CreateAscendDataQueue(const std::string &channel_name, bool dynamic_shape, size_t capacity,
void *, const std::vector<size_t> &) {
if (dynamic_shape) {
return std::make_shared<AscendDataQueueDynamic>(capacity);
}

int32_t is_heterogeneous = 0;
(void)rtGetIsHeterogenous(&is_heterogeneous);
if (is_heterogeneous != 0) {
return std::make_shared<AscendHostQueue>(channel_name);
} else {
return std::make_shared<AscendTdtQueue>(channel_name);
}
}

REGISTER_DATA_QUEUE_CREATOR(kAscendDevice, CreateAscendDataQueue);
} // namespace
} // namespace device
} // namespace mindspore
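REGISTER_DATA_QUEUE_CREATOR presumably expands to a static registrar that hands CreateAscendDataQueue to DataQueueMgr, so the manager can later resolve a creator by device name. A hedged sketch of that idiom (the registrar name is illustrative and the exact macro expansion is an assumption):

namespace {
struct AscendQueueRegistrar {
  AscendQueueRegistrar() {
    // Assumed wiring: the creator is stored under the device name for later lookup
    // by DataQueueMgr::CreateDataQueue(kAscendDevice, ...).
    mindspore::device::DataQueueMgr::GetInstance().RegisterDataQueueCreator(kAscendDevice, CreateAscendDataQueue);
  }
} g_ascend_queue_registrar;
}  // namespace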
@@ -20,23 +20,27 @@
|
|||
#include <unistd.h>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <functional>
|
||||
#include "runtime/hardware/device_context_manager.h"
|
||||
#include "runtime/data_queue/data_queue.h"
|
||||
#include "runtime/rt.h"
|
||||
#include "include/backend/data_queue/data_queue.h"
|
||||
#include "include/backend/visible.h"
|
||||
#include "runtime/rt.h"
|
||||
#include "acl/acl_tdt.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
class BACKEND_EXPORT AscendDataQueueDynamic : public DataQueue {
|
||||
public:
|
||||
explicit AscendDataQueueDynamic(const size_t capacity);
|
||||
virtual ~AscendDataQueueDynamic() = default;
|
||||
~AscendDataQueueDynamic() override = default;
|
||||
|
||||
BlockQueueStatus_T Push(std::vector<DataQueueItem> data);
|
||||
BlockQueueStatus_T Front(std::vector<DataQueueItem> *data) const;
|
||||
BlockQueueStatus_T Pop();
|
||||
bool Destroy();
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override;
|
||||
DataQueueStatus Pop() override;
|
||||
bool Destroy() override;
|
||||
void SetThreadDevice() override {}
|
||||
|
||||
private:
|
||||
struct NodeInfo {
|
||||
|
@@ -45,7 +49,82 @@ class BACKEND_EXPORT AscendDataQueueDynamic : public DataQueue {
|
|||
rtStream_t stream_;
|
||||
std::unique_ptr<NodeInfo[]> node_info_;
|
||||
};
|
||||
|
||||
namespace tdt_handle {
|
||||
void AddHandle(acltdtChannelHandle **handle, std::thread *use_thread);
|
||||
bool DestroyHandle();
|
||||
void DelHandle(acltdtChannelHandle **handle);
|
||||
bool IsClosed();
|
||||
} // namespace tdt_handle
|
||||
|
||||
class AscendTdtQueue : public DataQueue {
|
||||
public:
|
||||
explicit AscendTdtQueue(const std::string &channel_name);
|
||||
~AscendTdtQueue() override;
|
||||
|
||||
bool IsOpen() const override;
|
||||
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override { return DataQueueStatus::SUCCESS; }
|
||||
DataQueueStatus Pop() override { return DataQueueStatus::SUCCESS; }
|
||||
bool Destroy() override;
|
||||
void SetThreadDevice() override {}
|
||||
|
||||
private:
|
||||
void DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item = true);
|
||||
bool AssembleTensor2AclDataset(const std::vector<DataQueueItem> &data, acltdtDataset *acl_dataset);
|
||||
void ParseType(aclDataType acl_data_type, std::string *data_type);
|
||||
bool Translate(const std::vector<DataQueueItem> &data, acltdtDataset **output_acl_dataset);
|
||||
|
||||
acltdtChannelHandle *acl_handle_;
|
||||
std::string channel_name_;
|
||||
uint32_t device_id_;
|
||||
};
|
||||
|
||||
class AscendHostQueue : public DataQueue {
|
||||
public:
|
||||
explicit AscendHostQueue(const std::string &channel_name);
|
||||
~AscendHostQueue() override = default;
|
||||
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override { return DataQueueStatus::SUCCESS; }
|
||||
DataQueueStatus Pop() override { return DataQueueStatus::SUCCESS; }
|
||||
bool Destroy() override { return true; }
|
||||
void SetThreadDevice() override {}
|
||||
|
||||
struct DataItemInfo {
|
||||
struct ItemInfo {
|
||||
int32_t version;
|
||||
int32_t data_type;
|
||||
uint32_t cur_cnt;
|
||||
uint32_t cnt;
|
||||
int32_t tensor_type;
|
||||
uint32_t dim_num;
|
||||
char reserved[32];
|
||||
uint64_t data_len;
|
||||
} ctrl_info;
|
||||
std::vector<int64_t> dims;
|
||||
void *data_ptr;
|
||||
};
|
||||
|
||||
private:
|
||||
bool HostQueueInit();
|
||||
bool SendDataByHostQueue(const std::vector<DataQueueItem> &data);
|
||||
bool SetTransId4MBuf(void **buff);
|
||||
bool LaunchTensor2MBuff(const std::vector<DataQueueItem> &data, void **buff);
|
||||
bool EnqueueData(void *buff, bool *need_resend);
|
||||
bool CreateDataItemInfos(const std::vector<DataQueueItem> &data, std::vector<DataItemInfo> *items);
|
||||
bool SerializeDataItemInfos(std::vector<DataItemInfo> *items, void **buff);
|
||||
DataItemInfo BuildDataItemInfo(acltdtTensorType acl_data_type, int32_t tensor_type, const int64_t *dims,
|
||||
size_t dim_size, void *data_ptr, uint64_t data_len);
|
||||
void HostQueueFreeBuff(void *buff);
|
||||
|
||||
std::string channel_name_;
|
||||
uint32_t device_id_;
|
||||
std::mutex queue_id_to_trans_id_map_mutex_;
|
||||
std::map<uint32_t, uint64_t> queue_id_to_trans_id_map_;
|
||||
uint32_t queue_id_;
|
||||
};
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_BLOCKING_QUEUE_H_
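For reference, the contract these classes fulfil is small; a purely illustrative DataQueue subclass (not present in the codebase) would only need the following overrides, assuming it lives in namespace mindspore::device:

class NoopDataQueue : public DataQueue {
 public:
  explicit NoopDataQueue(size_t capacity) : DataQueue(capacity) {}
  ~NoopDataQueue() override = default;

  DataQueueStatus Push(std::vector<DataQueueItem> data) override { return DataQueueStatus::SUCCESS; }
  DataQueueStatus Front(std::vector<DataQueueItem> *data) const override { return DataQueueStatus::SUCCESS; }
  DataQueueStatus Pop() override { return DataQueueStatus::SUCCESS; }
  bool Destroy() override { return true; }
  void SetThreadDevice() override {}
};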
|
||||
|
|
|
@@ -62,10 +62,7 @@
|
|||
#endif
|
||||
#include "include/common/utils/config_manager.h"
|
||||
#include "plugin/device/ascend/hal/hccl_adapter/hccl_adapter.h"
|
||||
#ifdef ENABLE_TDTQUE
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
using mindspore::dataset::TdtHandle;
|
||||
#endif
|
||||
#include "plugin/device/ascend/hal/device/ascend_data_queue.h"
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "include/common/debug/rdr/recorder_manager.h"
|
||||
#endif
|
||||
|
@@ -1102,10 +1099,10 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph &graph) {
|
|||
#ifndef ENABLE_SECURITY
|
||||
DumpTaskExceptionInfo(graph);
|
||||
#endif
|
||||
#ifdef ENABLE_TDTQUE
|
||||
#ifdef WITH_BACKEND
|
||||
// Run task error, we should call TdtHostDestroy to release tdt to avoid DeviceQueueOp hostPush hung
|
||||
// case1: cpu usage 100% cause thread/process exit, but some tdt thread remain in backend
|
||||
if (!TdtHandle::DestroyHandle()) {
|
||||
if (!tdt_handle::DestroyHandle()) {
|
||||
MS_LOG(WARNING) << "Destroy tdt channel failed.";
|
||||
} else {
|
||||
MS_LOG(INFO) << "Destroy tdt channel success.";
|
||||
|
|
|
@@ -27,7 +27,7 @@
|
|||
#include "ir/dtype/type.h"
|
||||
#include "acl/acl_tdt.h"
|
||||
#include "proto/print.pb.h"
|
||||
#include "minddata/dataset/engine/device_queue_impl/tdt/tdt_handle.h"
|
||||
#include "plugin/device/ascend/hal/device/ascend_data_queue.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
namespace mindspore::device::ascend {
|
||||
|
@@ -392,7 +392,7 @@ void CreateTensorPrintThread(const PrintThreadCrt &ctr) {
|
|||
<< MsContext::GetInstance()->get_param<uint32_t>(MS_CTX_TSD_REF) << ".";
|
||||
std::string print_file_path = MsContext::GetInstance()->get_param<std::string>(MS_CTX_PRINT_FILE_PATH);
|
||||
g_acl_tdt_print = ctr(print_file_path, g_acl_handle);
|
||||
dataset::TdtHandle::AddHandle(&g_acl_handle, &g_acl_tdt_print);
|
||||
tdt_handle::AddHandle(&g_acl_handle, &g_acl_tdt_print);
|
||||
}
|
||||
|
||||
void DestroyTensorPrintThread() {
|
||||
|
@@ -419,7 +419,7 @@ void DestroyTensorPrintThread() {
|
|||
MS_LOG(ERROR) << "Failed destroy acl channel and the destroyed_status is " << destroyed_status << std::endl;
|
||||
return;
|
||||
}
|
||||
dataset::TdtHandle::DelHandle(&g_acl_handle);
|
||||
tdt_handle::DelHandle(&g_acl_handle);
|
||||
MS_LOG(INFO) << "Succeed destroy acl channel";
|
||||
}
|
||||
} // namespace mindspore::device::ascend
|
||||
|
|
|
@@ -297,12 +297,10 @@ bool AscendDeprecatedInterface::OpenTsd(const std::shared_ptr<MsContext> &ms_con
|
|||
MS_LOG(EXCEPTION) << "Device " << device_id << " call rtSetDevice failed, ret[" << static_cast<int>(ret) << "]";
|
||||
}
|
||||
ms_context_ptr->increase_param<uint32_t>(MS_CTX_TSD_REF);
|
||||
#ifdef ENABLE_TDTQUE
|
||||
auto thread_crt = [](const std::string &path, const acltdtChannelHandle *acl_handle) {
|
||||
return std::thread(TensorPrint(path, acl_handle));
|
||||
};
|
||||
CreateTensorPrintThread(thread_crt);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -315,10 +313,8 @@ bool AscendDeprecatedInterface::CloseTsd(const std::shared_ptr<MsContext> &ms_co
|
|||
ms_context_ptr->decrease_param<uint32_t>(MS_CTX_TSD_REF);
|
||||
if (force || ms_context_ptr->get_param<uint32_t>(MS_CTX_TSD_REF) == 0) {
|
||||
ms_context_ptr->set_param<uint32_t>(MS_CTX_TSD_REF, 0);
|
||||
#ifdef ENABLE_TDTQUE
|
||||
pybind11::gil_scoped_release gil_release;
|
||||
DestroyTensorPrintThread();
|
||||
#endif
|
||||
if (ErrorManager::GetInstance().Init() != 0) {
|
||||
MS_LOG(WARNING) << "Init ascend error manager failed, some ascend error log may be left out.";
|
||||
}
|
||||
|
|
|
@@ -15,7 +15,7 @@
|
|||
*/
|
||||
|
||||
#include "plugin/device/ascend/hal/hardware/ascend_device_res_manager.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "runtime/rt.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
|
|
@@ -29,7 +29,7 @@
|
|||
#include "acl/acl_rt.h"
|
||||
#include "runtime/device/kernel_runtime.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
|
|
@@ -15,28 +15,55 @@
|
|||
*/
|
||||
#include "plugin/device/gpu/hal/device/gpu_data_queue.h"
|
||||
#include <string>
|
||||
#include "plugin/device/gpu/hal/device/queue_common.h"
|
||||
#include <utility>
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "plugin/device/gpu/hal/device/queue_common.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace {
|
||||
void SetThreadToDeviceId(uint32_t device_id) {
|
||||
// Without cudaSetDevice, CUDA memory will be allocated on GPU:0 by default
// and will overload it in distributed scenarios.
|
||||
auto ret = cudaSetDevice(device_id);
|
||||
if (ret != cudaSuccess) {
|
||||
MS_LOG(EXCEPTION) << "cudaSetDevice failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<void> AllocHostMemory(size_t size) {
|
||||
void *ptr;
|
||||
auto ret = cudaHostAlloc(&ptr, size, cudaHostAllocDefault);
|
||||
if (ret != cudaSuccess) {
|
||||
MS_LOG(EXCEPTION) << "cudaHostAlloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
|
||||
}
|
||||
|
||||
return std::shared_ptr<void>(ptr, [](void *ptr) -> void {
|
||||
if (ptr != nullptr) {
|
||||
(void)cudaFreeHost(ptr);
|
||||
}
|
||||
});
|
||||
}
|
||||
} // namespace
|
||||
GpuDataQueueDynamic::GpuDataQueueDynamic(const size_t capacity) : DataQueue(capacity), node_info_(nullptr) {
|
||||
node_info_ = std::make_unique<NodeInfo[]>(capacity);
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
const std::string &device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
device_context_ = DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
|
||||
device_id_ = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
device_context_ = DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id_});
|
||||
device_context_->Initialize();
|
||||
stream_ = reinterpret_cast<cudaStream_t>(gpu::GPUDeviceManager::GetInstance().default_stream());
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
||||
DataQueueStatus GpuDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
||||
for (size_t i = 0; i < data.size(); i++) {
|
||||
auto &item = data[i];
|
||||
if (item.data_ptr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Invalid Input: ptr: " << item.data_ptr_ << ", len: " << item.data_len_;
|
||||
return ERROR_INPUT;
|
||||
return DataQueueStatus::ERROR_INPUT;
|
||||
}
|
||||
void *addr = device_context_->device_res_manager_->AllocateMemory(item.data_len_);
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(addr, item.data_ptr_, item.data_len_, cudaMemcpyHostToDevice, stream_),
|
||||
|
@@ -47,42 +74,49 @@ BlockQueueStatus_T GpuDataQueueDynamic::Push(std::vector<DataQueueItem> data) {
|
|||
node_info_[tail_].event_.reset(new cudaEvent_t());
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&(*(node_info_[tail_].event_))), "Cuda Create Event Failed");
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(*(node_info_[tail_].event_), stream_), "Cuda Create Event Failed");
|
||||
node_info_[tail_].data_ = data;
|
||||
node_info_[tail_].data_ = std::move(data);
|
||||
tail_ = (tail_ + 1) % (capacity_);
|
||||
++size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuDataQueueDynamic::Front(std::vector<DataQueueItem> *data) const {
|
||||
DataQueueStatus GpuDataQueueDynamic::Front(std::vector<DataQueueItem> *data) const {
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(*(node_info_[head_].event_)), "Cuda Event Syn Failed");
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(*(node_info_[head_].event_)), "Cuda Destroy Event Failed");
|
||||
for (auto &item : node_info_[head_].data_) {
|
||||
host_release_(item.data_ptr_, item.worker_id_);
|
||||
}
|
||||
*data = node_info_[head_].data_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuDataQueueDynamic::Pop() {
|
||||
DataQueueStatus GpuDataQueueDynamic::Pop() {
|
||||
head_ = (head_ + 1) % (capacity_);
|
||||
--size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
bool GpuDataQueueDynamic::Destroy() { return true; }
|
||||
|
||||
GpuQueue::GpuQueue(void *addr, const std::vector<size_t> &shape, const size_t &capacity)
|
||||
void GpuDataQueueDynamic::SetThreadDevice() { SetThreadToDeviceId(device_id_); }
|
||||
|
||||
std::shared_ptr<void> GpuDataQueueDynamic::AllocHostMem(size_t size) { return AllocHostMemory(size); }
|
||||
|
||||
GpuQueue::GpuQueue(size_t capacity, void *addr, const std::vector<size_t> &shape)
|
||||
: DataQueue(capacity), buffer_(addr), shape_(shape), len_(0), stream_(0), node_info_(nullptr) {
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaStreamCreate(&stream_), "Cuda Create Stream Failed");
|
||||
node_info_ = std::make_unique<NodeInfo[]>(capacity);
|
||||
for (auto item : shape) {
|
||||
len_ += item;
|
||||
}
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
device_id_ = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
}
|
||||
|
||||
GpuQueue::~GpuQueue() { buffer_ = nullptr; }
|
||||
|
||||
BlockQueueStatus_T GpuQueue::Push(std::vector<DataQueueItem> data) {
|
||||
DataQueueStatus GpuQueue::Push(std::vector<DataQueueItem> data) {
|
||||
void *addr = reinterpret_cast<uint8_t *>(buffer_) + tail_ * len_;
|
||||
for (size_t i = 0; i < data.size(); i++) {
|
||||
auto &item = data[i];
|
||||
|
@@ -97,7 +131,7 @@ BlockQueueStatus_T GpuQueue::Push(std::vector<DataQueueItem> data) {
|
|||
<< shape_[i] << "), "
|
||||
<< "you need to call network.set_inputs() to "
|
||||
"configure dynamic dims of input data before running the network";
|
||||
return ERROR_INPUT;
|
||||
return DataQueueStatus::ERROR_INPUT;
|
||||
}
|
||||
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(addr, item.data_ptr_, item.data_len_, cudaMemcpyHostToDevice, stream_),
|
||||
|
@@ -109,26 +143,26 @@ BlockQueueStatus_T GpuQueue::Push(std::vector<DataQueueItem> data) {
|
|||
node_info_[tail_].event_.reset(new cudaEvent_t());
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&(*(node_info_[tail_].event_))), "Cuda Create Event Failed");
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(*(node_info_[tail_].event_), stream_), "Cuda Create Event Failed");
|
||||
node_info_[tail_].data_ = data;
|
||||
node_info_[tail_].data_ = std::move(data);
|
||||
tail_ = (tail_ + 1) % (capacity_);
|
||||
++size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuQueue::Front(std::vector<DataQueueItem> *data) const {
|
||||
DataQueueStatus GpuQueue::Front(std::vector<DataQueueItem> *data) const {
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(*(node_info_[head_].event_)), "Cuda Event Syn Failed");
|
||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(*(node_info_[head_].event_)), "Cuda Destroy Event Failed");
|
||||
for (auto &item : node_info_[head_].data_) {
|
||||
host_release_(item.data_ptr_, item.worker_id_);
|
||||
}
|
||||
*data = node_info_[head_].data_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T GpuQueue::Pop() {
|
||||
DataQueueStatus GpuQueue::Pop() {
|
||||
head_ = (head_ + 1) % (capacity_);
|
||||
--size_;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
bool GpuQueue::Destroy() {
|
||||
|
@@ -143,5 +177,22 @@ bool GpuQueue::Destroy() {
|
|||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
void GpuQueue::SetThreadDevice() { SetThreadToDeviceId(device_id_); }

std::shared_ptr<void> GpuQueue::AllocHostMem(size_t size) { return AllocHostMemory(size); }

namespace {
std::shared_ptr<DataQueue> CreateGpuDataQueue(const std::string &, bool dynamic_shape, size_t capacity, void *addr,
const std::vector<size_t> &shape) {
if (dynamic_shape) {
return std::make_shared<GpuDataQueueDynamic>(capacity);
}

return std::make_shared<GpuQueue>(capacity, addr, shape);
}

REGISTER_DATA_QUEUE_CREATOR(kGPUDevice, CreateGpuDataQueue);
} // namespace
} // namespace device
} // namespace mindspore
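A usage sketch of the static-queue path after this refactor, seen from the caller's side (channel name, buffer and shapes are made up, and the mindspore::device namespace is assumed to be in scope; every call now returns DataQueueStatus instead of the old BlockQueueStatus_T):

void ExampleStaticQueueRoundTrip(void *device_buffer) {
  auto &mgr = DataQueueMgr::GetInstance();
  const std::string channel = "example_channel";                 // hypothetical channel
  const std::vector<size_t> shapes = {32 * 32 * sizeof(float)};  // one output, size in bytes
  if (mgr.Create(channel, device_buffer, shapes, 2) != DataQueueStatus::SUCCESS) {
    return;
  }
  (void)mgr.Open(channel, [](void *host_ptr, int32_t worker_id) { /* release the host buffer */ });

  std::vector<DataQueueItem> items(1);  // filled by the data pipeline in practice
  if (mgr.Push(channel, items, 30) == DataQueueStatus::SUCCESS) {
    std::vector<DataQueueItem> out;
    if (mgr.Front(channel, &out) == DataQueueStatus::SUCCESS) {
      (void)mgr.Pop(channel);
    }
  }
}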
@@ -22,7 +22,7 @@
|
|||
#include <memory>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
#include "runtime/data_queue/data_queue.h"
|
||||
#include "include/backend/data_queue/data_queue.h"
|
||||
#include "runtime/hardware/device_context_manager.h"
|
||||
#include "include/backend/visible.h"
|
||||
|
||||
|
@@ -31,12 +31,15 @@ namespace device {
|
|||
class BACKEND_EXPORT GpuDataQueueDynamic : public DataQueue {
|
||||
public:
|
||||
explicit GpuDataQueueDynamic(const size_t capacity);
|
||||
virtual ~GpuDataQueueDynamic() = default;
|
||||
~GpuDataQueueDynamic() override = default;
|
||||
|
||||
BlockQueueStatus_T Push(std::vector<DataQueueItem> data);
|
||||
BlockQueueStatus_T Front(std::vector<DataQueueItem> *data) const;
|
||||
BlockQueueStatus_T Pop();
|
||||
bool Destroy();
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override;
|
||||
DataQueueStatus Pop() override;
|
||||
bool Destroy() override;
|
||||
|
||||
void SetThreadDevice() override;
|
||||
std::shared_ptr<void> AllocHostMem(size_t size) override;
|
||||
|
||||
private:
|
||||
struct NodeInfo {
|
||||
|
@@ -48,17 +51,21 @@ class BACKEND_EXPORT GpuDataQueueDynamic : public DataQueue {
|
|||
|
||||
cudaStream_t stream_;
|
||||
std::unique_ptr<NodeInfo[]> node_info_;
|
||||
uint32_t device_id_;
|
||||
};
|
||||
|
||||
class BACKEND_EXPORT GpuQueue : public DataQueue {
|
||||
public:
|
||||
GpuQueue(void *addr, const std::vector<size_t> &shape, const size_t &capacity);
|
||||
virtual ~GpuQueue();
|
||||
GpuQueue(size_t capacity, void *addr, const std::vector<size_t> &shape);
|
||||
~GpuQueue() override;
|
||||
|
||||
BlockQueueStatus_T Push(std::vector<DataQueueItem> data);
|
||||
BlockQueueStatus_T Front(std::vector<DataQueueItem> *data) const;
|
||||
BlockQueueStatus_T Pop();
|
||||
bool Destroy();
|
||||
DataQueueStatus Push(std::vector<DataQueueItem> data) override;
|
||||
DataQueueStatus Front(std::vector<DataQueueItem> *data) const override;
|
||||
DataQueueStatus Pop() override;
|
||||
bool Destroy() override;
|
||||
|
||||
void SetThreadDevice() override;
|
||||
std::shared_ptr<void> AllocHostMem(size_t size) override;
|
||||
|
||||
private:
|
||||
struct NodeInfo {
|
||||
|
@@ -73,6 +80,7 @@ class BACKEND_EXPORT GpuQueue : public DataQueue {
|
|||
cudaStream_t stream_;
|
||||
std::unique_ptr<NodeInfo[]> node_info_;
|
||||
bool ds_detected_{false};
|
||||
uint32_t device_id_;
|
||||
};
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
|
|
@@ -21,7 +21,7 @@
|
|||
#include "plugin/device/gpu/hal/device/gpu_device_address.h"
|
||||
#include "plugin/device/gpu/hal/device/cuda_driver.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_event.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
|
||||
#include "plugin/device/gpu/hal/device/distribution/collective_init.h"
|
||||
|
|
|
@@ -26,7 +26,7 @@
|
|||
#include "plugin/device/gpu/hal/device/distribution/collective_init.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
|
||||
#include "plugin/device/gpu/hal/hardware/gpu_somas.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "kernel/common_utils.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_common.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
|
||||
|
|
|
@@ -17,7 +17,7 @@
|
|||
#include "plugin/device/gpu/kernel/data/dataset_init_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/data/dataset_utils.h"
|
||||
#include "kernel/common_utils.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
|
||||
#include "include/common/utils/convert_utils.h"
|
||||
|
||||
|
@@ -60,7 +60,7 @@ bool DatasetInitKernelMod::Launch(const std::vector<AddressPtr> &, const std::ve
|
|||
}
|
||||
|
||||
auto status = DataQueueMgr::GetInstance().Create(queue_name_, addr, shapes_, buffer_q_capacity_);
|
||||
if (status) {
|
||||
if (status != device::DataQueueStatus::SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', init Dataset Failed. len: " << len << ", status:" << status;
|
||||
}
|
||||
|
||||
|
|
|
@@ -27,7 +27,7 @@
|
|||
#ifndef ENABLE_SECURITY
|
||||
#include "plugin/device/gpu/hal/profiler/gpu_profiling.h"
|
||||
#endif
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_common.h"
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "include/common/debug/rdr/recorder_manager.h"
|
||||
|
@@ -113,7 +113,7 @@ bool DatasetIteratorKernelMod::ReadDevice(std::vector<DataQueueItem> *data) {
|
|||
}
|
||||
#endif
|
||||
auto ret = DataQueueMgr::GetInstance().Front(queue_name_, data);
|
||||
if (ret == device::SUCCESS) {
|
||||
if (ret == device::DataQueueStatus::SUCCESS) {
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (profiling_enable_) {
|
||||
uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
|
||||
|
@@ -123,7 +123,7 @@ bool DatasetIteratorKernelMod::ReadDevice(std::vector<DataQueueItem> *data) {
|
|||
break;
|
||||
}
|
||||
|
||||
if (ret == device::TIMEOUT) {
|
||||
if (ret == device::DataQueueStatus::TIMEOUT) {
|
||||
repeat++;
|
||||
if (repeat < 10) {
|
||||
MS_LOG(INFO) << "Waiting for data...(" << repeat << " / 10)";
|
||||
|
@@ -155,7 +155,7 @@ bool DatasetIteratorKernelMod::Launch(const std::vector<AddressPtr> &, const std
|
|||
}
|
||||
if (!is_opened_) {
|
||||
auto ret = DataQueueMgr::GetInstance().OpenDynamicBufQueue(queue_name_);
|
||||
if (ret != device::BlockQueueStatus_T::SUCCESS) {
|
||||
if (ret != device::DataQueueStatus::SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', gpu Queue(" << queue_name_ << ") Open Failed: " << ret;
|
||||
}
|
||||
is_opened_ = true;
|
||||
|
|
|
@@ -24,7 +24,7 @@
|
|||
#include "plugin/device/gpu/kernel/data/dataset_profiling.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "runtime/data_queue/blocking_queue.h"
|
||||
#include "include/backend/data_queue/blocking_queue.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
using mindspore::device::DataQueueItem;
|
||||
|
|
|
@@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2019-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/data_queue/blocking_queue.h"
|
||||
#include "include/backend/data_queue/blocking_queue.h"
|
||||
#include "utils/log_adapter.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
@@ -22,60 +22,60 @@ namespace device {
|
|||
const size_t kTimeout = 100;
|
||||
void BlockingQueue::RegisterRelease(const std::function<void(void *, int32_t)> &func) { queue_->RegisterRelease(func); }
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Push(const std::vector<DataQueueItem> &data, unsigned int) {
|
||||
DataQueueStatus BlockingQueue::Push(const std::vector<DataQueueItem> &data, unsigned int) {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
if (queue_->IsFull()) {
|
||||
if (not_full_cond_.wait_for(locker, std::chrono::microseconds(kTimeout)) == std::cv_status::timeout) {
|
||||
return TIMEOUT;
|
||||
return DataQueueStatus::TIMEOUT;
|
||||
}
|
||||
}
|
||||
auto ret = queue_->Push(data);
|
||||
if (ret != SUCCESS) {
|
||||
if (ret != DataQueueStatus::SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
not_empty_cond_.notify_one();
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Front(std::vector<DataQueueItem> *data) {
|
||||
DataQueueStatus BlockingQueue::Front(std::vector<DataQueueItem> *data) {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
bool timeout = not_empty_cond_.wait_for(locker, std::chrono::seconds(30), [this] { return !queue_->IsEmpty(); });
|
||||
if (!timeout) {
|
||||
return TIMEOUT;
|
||||
return DataQueueStatus::TIMEOUT;
|
||||
}
|
||||
return queue_->Front(data);
|
||||
}
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Pop() {
|
||||
DataQueueStatus BlockingQueue::Pop() {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
not_empty_cond_.wait(locker, [this] { return !queue_->IsEmpty(); });
|
||||
auto ret = queue_->Pop();
|
||||
if (ret != SUCCESS) {
|
||||
if (ret != DataQueueStatus::SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
not_full_cond_.notify_one();
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Create(const std::shared_ptr<DataQueue> &data_queue) {
|
||||
DataQueueStatus BlockingQueue::Create(const std::shared_ptr<DataQueue> &data_queue) {
|
||||
this->queue_ = data_queue;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T BlockingQueue::Clear() {
|
||||
DataQueueStatus BlockingQueue::Clear() {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
while (Size() > 0) {
|
||||
std::vector<DataQueueItem> data;
|
||||
auto ret = queue_->Front(&data);
|
||||
if (ret != SUCCESS) {
|
||||
if (ret != DataQueueStatus::SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
ret = queue_->Pop();
|
||||
if (ret != SUCCESS) {
|
||||
if (ret != DataQueueStatus::SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
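Front() above only waits 30 seconds for data, so callers see DataQueueStatus::TIMEOUT rather than blocking forever. A caller-side sketch of the usual bounded retry (it mirrors how the dataset-iterator kernel later in this commit handles it; the helper name is illustrative):

DataQueueStatus FrontWithRetry(BlockingQueue *queue, std::vector<DataQueueItem> *data, int max_retry = 10) {
  for (int i = 0; i < max_retry; ++i) {
    auto ret = queue->Front(data);
    if (ret != DataQueueStatus::TIMEOUT) {
      return ret;  // SUCCESS or a real error, both end the retry loop
    }
  }
  return DataQueueStatus::TIMEOUT;
}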
|
||||
|
||||
bool BlockingQueue::Destroy() {
|
||||
|
|
|
@@ -14,9 +14,11 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/data_queue/data_queue.h"
|
||||
#include "include/backend/data_queue/data_queue.h"
|
||||
#include <string>
|
||||
#include "utils/ms_context.h"
|
||||
#include "runtime/hardware/device_context_manager.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
DataQueue::DataQueue(const size_t capacity)
|
||||
|
|
|
@@ -14,18 +14,16 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#if ENABLE_GPU
|
||||
#include "plugin/device/gpu/hal/device/gpu_data_queue.h"
|
||||
#elif ENABLE_D
|
||||
#include "plugin/device/ascend/hal/device/ascend_data_queue.h"
|
||||
#endif
|
||||
#include "include/backend/data_queue/data_queue_mgr.h"
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include "utils/log_adapter.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "pybind11/pybind11.h"
|
||||
#include "pybind11/stl.h"
|
||||
#include "include/common/utils/anfalgo.h"
|
||||
#include "include/backend/data_queue/blocking_queue.h"
|
||||
#include "runtime/device/kernel_info.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
|
@@ -36,136 +34,151 @@ DataQueueMgr &DataQueueMgr::GetInstance() noexcept {
|
|||
return instance;
|
||||
}

BlockQueueStatus_T DataQueueMgr::Create(const std::string &channel_name, void *addr, const std::vector<size_t> &shape,
const size_t &capacity) {
void DataQueueMgr::RegisterDataQueueCreator(const std::string &device_name, DataQueueCreator &&creator) {
data_queue_creator_map_.emplace(device_name, std::forward<DataQueueCreator>(creator));
}

std::shared_ptr<DataQueue> DataQueueMgr::CreateDataQueue(const std::string &device_name,
const std::string &channel_name, bool dynamic_shape,
size_t capacity, void *addr,
const std::vector<size_t> &shape) {
auto iter = data_queue_creator_map_.find(device_name);
if (iter == data_queue_creator_map_.end()) {
return nullptr;
}

return iter->second(channel_name, dynamic_shape, capacity, addr, shape);
}
|
||||
|
||||
DataQueueStatus DataQueueMgr::Create(const std::string &channel_name, void *addr, const std::vector<size_t> &shape,
|
||||
const size_t &capacity) {
|
||||
MS_LOG(INFO) << "Static GPU queue: " << channel_name << " created";
|
||||
if (name_queue_map_.find(channel_name) != name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue already exist: " << channel_name;
|
||||
return QUEUE_EXIST;
|
||||
return DataQueueStatus::QUEUE_EXIST;
|
||||
}
|
||||
#if ENABLE_GPU
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
std::shared_ptr<DataQueue> gpu_queue = std::make_shared<GpuQueue>(addr, shape, capacity);
|
||||
BlockQueueStatus_T rt = queue->Create(gpu_queue);
|
||||
if (rt != SUCCESS) {
|
||||
MS_LOG(ERROR) << "Queue: " << channel_name << "create failed: " << rt;
|
||||
return rt;
|
||||
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
std::shared_ptr<DataQueue> data_queue = CreateDataQueue(
|
||||
MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET), channel_name, false, capacity, addr, shape);
|
||||
if (data_queue != nullptr) {
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
DataQueueStatus rt = queue->Create(data_queue);
|
||||
if (rt != DataQueueStatus::SUCCESS) {
|
||||
MS_LOG(ERROR) << "Queue: " << channel_name << "create failed: " << rt;
|
||||
return rt;
|
||||
}
|
||||
(void)name_queue_map_.insert(std::make_pair(channel_name, queue));
|
||||
init_ = true;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
(void)name_queue_map_.insert(std::make_pair(channel_name, queue));
|
||||
init_ = true;
|
||||
return SUCCESS;
|
||||
#else
|
||||
|
||||
MS_LOG(ERROR) << "Static data queue only support GPU target.";
|
||||
return INTERNAL_ERROR;
|
||||
#endif
|
||||
return DataQueueStatus::INTERNAL_ERROR;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Open(const std::string &channel_name,
|
||||
const std::function<void(void *, int32_t)> func) {
|
||||
DataQueueStatus DataQueueMgr::Open(const std::string &channel_name, const std::function<void(void *, int32_t)> func) {
|
||||
MS_LOG(INFO) << "Gpu queue: " << channel_name << " open.";
|
||||
if (name_queue_map_.find(channel_name) == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
|
||||
name_queue_map_[channel_name]->RegisterRelease(func);
|
||||
open_by_dataset_++;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
BlockQueueStatus_T DataQueueMgr::OpenDynamicBufQueue(const std::string &channel_name,
|
||||
const std::function<void(void *, int32_t)> func) {
|
||||
DataQueueStatus DataQueueMgr::OpenDynamicBufQueue(const std::string &channel_name,
|
||||
const std::function<void(void *, int32_t)> func) {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
if (name_queue_map_.find(channel_name) == name_queue_map_.end()) {
|
||||
BlockQueueStatus_T status = CreateDynamicBufQueue(channel_name, default_capacity_);
|
||||
MS_EXCEPTION_IF_CHECK_FAIL(status == BlockQueueStatus_T::SUCCESS, "Create dynamic buffer queue failed");
|
||||
DataQueueStatus status = CreateDynamicBufQueue(channel_name, default_capacity_);
|
||||
MS_EXCEPTION_IF_CHECK_FAIL(status == DataQueueStatus::SUCCESS, "Create dynamic buffer queue failed");
|
||||
MS_LOG_INFO << "Create dynamic buffer queue: " << channel_name;
|
||||
cv_.notify_all();
|
||||
}
|
||||
name_queue_map_[channel_name]->RegisterRelease(func);
|
||||
open_by_dataset_++;
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::OpenDynamicBufQueue(const std::string &channel_name) {
|
||||
DataQueueStatus DataQueueMgr::OpenDynamicBufQueue(const std::string &channel_name) {
|
||||
std::unique_lock<std::mutex> locker(mutex_);
|
||||
auto time_out = cv_.wait_for(locker, std::chrono::seconds(MAX_WAIT_TIME_IN_SEC),
|
||||
[this, &channel_name] { return name_queue_map_.count(channel_name); });
|
||||
if (!time_out) {
|
||||
return TIMEOUT;
|
||||
return DataQueueStatus::TIMEOUT;
|
||||
}
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::CreateDynamicBufQueue(const std::string &channel_name, const size_t &capacity) {
|
||||
DataQueueStatus DataQueueMgr::CreateDynamicBufQueue(const std::string &channel_name, const size_t &capacity) {
|
||||
if (name_queue_map_.find(channel_name) != name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue already exist: " << channel_name;
|
||||
return QUEUE_EXIST;
|
||||
return DataQueueStatus::QUEUE_EXIST;
|
||||
}
|
||||
#if ENABLE_GPU
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
std::shared_ptr<DataQueue> device_queue = std::make_shared<GpuDataQueueDynamic>(capacity);
|
||||
#elif ENABLE_D
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
std::shared_ptr<DataQueue> device_queue = std::make_shared<AscendDataQueueDynamic>(capacity);
|
||||
#else
|
||||
MS_LOG(ERROR) << "Dynamic data queue only support Ascend/GPU target.";
|
||||
return QUEUE_EXIST;
|
||||
#endif
|
||||
#if ENABLE_GPU || ENABLE_D
|
||||
BlockQueueStatus_T rt = queue->Create(device_queue);
|
||||
if (rt != SUCCESS) {
|
||||
MS_LOG(ERROR) << "Queue: " << channel_name << "create failed: " << rt;
|
||||
return rt;
|
||||
MS_EXCEPTION_IF_NULL(MsContext::GetInstance());
|
||||
std::string device_name = MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
std::shared_ptr<DataQueue> device_queue = CreateDataQueue(device_name, channel_name, true, capacity, nullptr, {});
|
||||
if (device_queue != nullptr && (device_name == kGPUDevice || device_name == kAscendDevice)) {
|
||||
std::shared_ptr<BlockingQueue> queue = std::make_shared<BlockingQueue>();
|
||||
DataQueueStatus rt = queue->Create(device_queue);
|
||||
if (rt != DataQueueStatus::SUCCESS) {
|
||||
MS_LOG(ERROR) << "Queue: " << channel_name << "create failed: " << rt;
|
||||
return rt;
|
||||
}
|
||||
(void)name_queue_map_.insert(std::make_pair(channel_name, queue));
|
||||
init_ = true;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
(void)name_queue_map_.insert(std::make_pair(channel_name, queue));
|
||||
init_ = true;
|
||||
return SUCCESS;
|
||||
#endif
|
||||
|
||||
MS_LOG(ERROR) << "Dynamic data queue only support Ascend/GPU target, bug got " << device_name;
|
||||
return DataQueueStatus::QUEUE_EXIST;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Open(const std::string &channel_name) const {
|
||||
DataQueueStatus DataQueueMgr::Open(const std::string &channel_name) const {
|
||||
if (name_queue_map_.find(channel_name) == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
return SUCCESS;
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Push(const std::string &channel_name, const std::vector<DataQueueItem> &data,
|
||||
unsigned int timeout_in_sec) {
|
||||
DataQueueStatus DataQueueMgr::Push(const std::string &channel_name, const std::vector<DataQueueItem> &data,
|
||||
unsigned int timeout_in_sec) {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
return iter->second->Push(data, timeout_in_sec);
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Front(const std::string &channel_name, std::vector<DataQueueItem> *data) {
|
||||
DataQueueStatus DataQueueMgr::Front(const std::string &channel_name, std::vector<DataQueueItem> *data) {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
|
||||
return iter->second->Front(data);
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Pop(const std::string &channel_name) {
|
||||
DataQueueStatus DataQueueMgr::Pop(const std::string &channel_name) {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
|
||||
return iter->second->Pop();
|
||||
}
|
||||
|
||||
BlockQueueStatus_T DataQueueMgr::Clear(const std::string &channel_name) {
|
||||
DataQueueStatus DataQueueMgr::Clear(const std::string &channel_name) {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return QUEUE_NOT_EXIST;
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
|
||||
return iter->second->Clear();
|
||||
|
@@ -238,11 +251,39 @@ size_t DataQueueMgr::Capacity(const std::string &channel_name) {
|
|||
return name_queue_map_.at(channel_name)->Capacity();
|
||||
}
|
||||
|
||||
std::shared_ptr<DataQueue> DataQueueMgr::GetDataQueue(const std::string &channel_name) const {
|
||||
auto iter = name_queue_map_.find(channel_name);
|
||||
if (iter == name_queue_map_.end()) {
|
||||
MS_LOG(ERROR) << "Queue not exist " << channel_name;
|
||||
return nullptr;
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(iter->second);
|
||||
return iter->second->Queue();
|
||||
}
|
||||
|
||||
DataQueueStatus DataQueueMgr::SetThreadDevice(const std::string &device_name) {
|
||||
auto tmp_queue = CreateDataQueue(device_name, {}, false, 0, nullptr, {});
|
||||
if (tmp_queue == nullptr) {
|
||||
return DataQueueStatus::QUEUE_NOT_EXIST;
|
||||
}
|
||||
tmp_queue->SetThreadDevice();
|
||||
return DataQueueStatus::SUCCESS;
|
||||
}
|
||||
|
||||
std::shared_ptr<void> DataQueueMgr::AllocHostMem(const std::string &device_name, size_t size) {
|
||||
auto tmp_queue = CreateDataQueue(device_name, {}, false, 0, nullptr, {});
|
||||
if (tmp_queue == nullptr) {
|
||||
return std::shared_ptr<void>(::malloc(size), ::free);
|
||||
}
|
||||
|
||||
return tmp_queue->AllocHostMem(size);
|
||||
}
|
||||
#ifndef BUILD_LITE
|
||||
bool PopDataFromDataQueue(const AnfNodePtr &data_kernel) {
|
||||
auto queue_name = common::AnfAlgo::GetNodeAttr<std::string>(data_kernel, "shared_name");
|
||||
device::DataQueueMgr &buf_mgr = device::DataQueueMgr::GetInstance();
|
||||
auto ret = buf_mgr.OpenDynamicBufQueue(queue_name);
|
||||
MS_EXCEPTION_IF_CHECK_FAIL(ret == device::BlockQueueStatus_T::SUCCESS, "Open dynamic data queue failed");
|
||||
MS_EXCEPTION_IF_CHECK_FAIL(ret == device::DataQueueStatus::SUCCESS, "Open dynamic data queue failed");
|
||||
std::vector<device::DataQueueItem> data;
|
||||
auto kernel_info = dynamic_cast<device::KernelInfo *>(data_kernel->kernel_info());
|
||||
(void)buf_mgr.Front(queue_name, &data);
|
||||
|
@@ -271,29 +312,6 @@ bool PopDataFromDataQueue(const AnfNodePtr &data_kernel) {
|
|||
common::AnfAlgo::SetOutputInferTypeAndShape(types, shapes, data_kernel.get());
|
||||
return true;
|
||||
}
|
||||
namespace {
|
||||
struct DataQueueHandlerRegister {
|
||||
DataQueueHandlerRegister() noexcept {
|
||||
DataQueueHandler::SetOpenHandler(
|
||||
[](const std::string channel_name, const std::function<void(void *, int32_t)> func) -> BlockQueueStatus_T {
|
||||
return DataQueueMgr::GetInstance().Open(channel_name, func);
|
||||
});
|
||||
DataQueueHandler::SetOpenDynamicBufQueueHandler(
|
||||
[](const std::string &channel_name, const std::function<void(void *, int32_t)> func) -> BlockQueueStatus_T {
|
||||
return DataQueueMgr::GetInstance().OpenDynamicBufQueue(channel_name, func);
|
||||
});
|
||||
DataQueueHandler::SetIsClosedHandler([]() -> bool { return DataQueueMgr::GetInstance().IsClosed(); });
|
||||
DataQueueHandler::SetCloseConfirmHandler([]() { DataQueueMgr::GetInstance().CloseConfirm(); });
|
||||
DataQueueHandler::SetCloseHandler([](const std::string &channel) { DataQueueMgr::GetInstance().Close(channel); });
|
||||
DataQueueHandler::SetClearHandler([](const std::string &channel) -> device::BlockQueueStatus_T {
|
||||
return DataQueueMgr::GetInstance().Clear(channel);
|
||||
});
|
||||
DataQueueHandler::SetPushHandler([](const std::string &channel, const std::vector<device::DataQueueItem> &items,
|
||||
unsigned int timeout) -> device::BlockQueueStatus_T {
|
||||
return DataQueueMgr::GetInstance().Push(channel, items, timeout);
|
||||
});
|
||||
}
|
||||
} callback_register;
|
||||
} // namespace
|
||||
#endif
|
||||
} // namespace device
|
||||
} // namespace mindspore
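The DataQueueHandler block above follows a late-binding idiom: the dataset side only sees std::function slots, and this file fills them in from a static initializer so neither side has to link the other directly. A generic, fully hypothetical sketch of that idiom (the names below are not from MindSpore):

class ExampleHandler {
 public:
  using OpenFn = std::function<DataQueueStatus(const std::string &)>;
  static void SetOpenHandler(OpenFn fn) { open_fn_ = std::move(fn); }
  static DataQueueStatus Open(const std::string &channel) {
    return open_fn_ ? open_fn_(channel) : DataQueueStatus::QUEUE_NOT_EXIST;
  }

 private:
  static inline OpenFn open_fn_;  // set once at static-init time by the backend
};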
|
||||
|
|
|
@@ -1,131 +0,0 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_DATA_QUEUE_MGR_H_
|
||||
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_DATA_QUEUE_MGR_H_
|
||||
|
||||
#include <unistd.h>
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include "runtime/data_queue/blocking_queue.h"
|
||||
|
||||
#define EXPORT __attribute__((visibility("default")))
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
static const unsigned int MAX_WAIT_TIME_IN_SEC = 60;
|
||||
|
||||
class Semaphore {
|
||||
public:
|
||||
explicit Semaphore(int count = 0) : count_(count) {}
|
||||
|
||||
inline void Signal() {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
++count_;
|
||||
cv_.notify_one();
|
||||
}
|
||||
|
||||
inline bool Wait() {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
while (count_ == 0) {
|
||||
if (cv_.wait_for(lock, std::chrono::seconds(MAX_WAIT_TIME_IN_SEC)) == std::cv_status::timeout) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
--count_;
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex mutex_;
|
||||
std::condition_variable cv_;
|
||||
int count_;
|
||||
};
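A short sketch of how this Semaphore is meant to be used between a dataset producer thread and a consumer that must not hang forever: the 60 s timeout in Wait() turns a stalled producer into a recoverable false return instead of a deadlock (the thread bodies below are illustrative only, not part of the original file):

// Assumes the Semaphore class above is visible in this translation unit.
#include <thread>

void ProducerConsumerSketch() {
  mindspore::device::Semaphore sem;
  std::thread producer([&sem]() {
    // ... stage one batch ...
    sem.Signal();  // wake exactly one waiter
  });
  bool got_batch = sem.Wait();  // false if nothing was signalled within MAX_WAIT_TIME_IN_SEC
  producer.join();
  (void)got_batch;
}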
class DataQueueMgr {
 public:
  EXPORT DataQueueMgr() : cur_dev_id_(0), init_(false), closed_(false), open_by_dataset_(0) {}

  EXPORT virtual ~DataQueueMgr() = default;

  EXPORT static DataQueueMgr &GetInstance() noexcept;

  EXPORT BlockQueueStatus_T Create(const std::string &channel_name, void *addr, const std::vector<size_t> &shape,
                                   const size_t &capacity);

  // call for Push thread
  EXPORT BlockQueueStatus_T Open(const std::string &channel_name, std::function<void(void *, int32_t)> func);

  // call for Front/Pop thread
  EXPORT BlockQueueStatus_T Open(const std::string &channel_name) const;
  EXPORT BlockQueueStatus_T Push(const std::string &channel_name, const std::vector<DataQueueItem> &data,
                                 unsigned int timeout_in_sec);
  EXPORT BlockQueueStatus_T Front(const std::string &channel_name, std::vector<DataQueueItem> *data);
  EXPORT BlockQueueStatus_T Pop(const std::string &channel_name);
  EXPORT BlockQueueStatus_T Clear(const std::string &channel_name);

  EXPORT BlockQueueStatus_T OpenDynamicBufQueue(const std::string &channel_name,
                                                const std::function<void(void *, int32_t)> func);
  EXPORT BlockQueueStatus_T CreateDynamicBufQueue(const std::string &channel_name, const size_t &capacity);
  EXPORT BlockQueueStatus_T OpenDynamicBufQueue(const std::string &channel_name);

  EXPORT void Close(const std::string &channel_name) const noexcept;

  EXPORT bool IsInit() const;

  EXPORT bool IsClosed() const;

  EXPORT bool Destroy();

  // call for Release GPU Resources
  EXPORT bool CloseNotify();

  // call for dataset send thread
  EXPORT void CloseConfirm();

  EXPORT size_t Size(const std::string &channel_name);

  EXPORT size_t Capacity(const std::string &channel_name);

 private:
  int cur_dev_id_;
  bool init_;
  bool closed_;
  std::mutex mutex_;
  std::mutex close_mutex_;
  std::condition_variable cv_;
  // how many queues opened by dataset
  int open_by_dataset_;
  Semaphore sema;
  bool dynamic_shape_{false};
  size_t default_capacity_{2};

  std::map<std::string, std::shared_ptr<BlockingQueue>> name_queue_map_;

  inline bool isCreated(const std::string &channel_name) const;

  DataQueueMgr(const DataQueueMgr &) = delete;
  DataQueueMgr &operator=(const DataQueueMgr &) = delete;
};
bool PopDataFromDataQueue(const AnfNodePtr &data_kernel);
}  // namespace device
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_BUFFER_MGR_H_
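For orientation, a hedged sketch of how this now-removed interface was typically driven end to end; the channel name, shape, capacity, and timeout are placeholder values, and error handling is elided:

// Sketch against the pre-refactor declarations above (BlockQueueStatus_T era), not this commit's new API.
#include <vector>

void OldQueueLifecycleSketch(void *device_addr) {
  using mindspore::device::DataQueueItem;
  using mindspore::device::DataQueueMgr;
  auto &mgr = DataQueueMgr::GetInstance();

  // Create a channel backed by device memory, then open it for the push (dataset) side.
  (void)mgr.Create("example_channel", device_addr, {32, 224, 224, 3}, 4);
  (void)mgr.Open("example_channel", [](void *, int32_t) { /* release callback */ });

  std::vector<DataQueueItem> batch;  // filled by the dataset pipeline
  (void)mgr.Push("example_channel", batch, /*timeout_in_sec=*/30);

  std::vector<DataQueueItem> front;
  (void)mgr.Front("example_channel", &front);  // peek for the consuming kernel
  (void)mgr.Pop("example_channel");            // then drop it
  mgr.Close("example_channel");
}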
@ -11,10 +11,5 @@ if("${ENABLE_HIDDEN}" STREQUAL "OFF")
    string(REPLACE " -fvisibility=hidden" " -fvisibility=default" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

if(ENABLE_TDTQUE)
    file(GLOB_RECURSE TDT_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "../../minddata/dataset/engine/tdt/tdt_handle.cc")
endif()

set_property(SOURCE ${DEVICE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
add_library(_mindspore_runtime_device_obj OBJECT ${DEVICE_SRC_LIST} ${TDT_SRC_LIST})
add_library(_mindspore_runtime_device_obj OBJECT ${DEVICE_SRC_LIST})
@ -1,20 +0,0 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/data_queue_handler.h"
namespace {
// empty source file trick to avoid symbol ex/import problem on Windows
}  // namespace
@ -1,50 +0,0 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_INCLUDE_COMMON_UTILS_DATA_QUEUE_HANDLER_H_
#define MINDSPORE_CCSRC_INCLUDE_COMMON_UTILS_DATA_QUEUE_HANDLER_H_

#include <string>
#include <utility>
#include <functional>
#include <vector>
#include "utils/macros.h"
#include "utils/callback_handler.h"

namespace mindspore {
namespace device {
enum BlockQueueStatus_T : int { SUCCESS = 0, QUEUE_EXIST, QUEUE_NOT_EXIST, ERROR_INPUT, INTERNAL_ERROR, TIMEOUT };
struct DataQueueItem {
  int32_t worker_id_{0};
  std::string data_type_;
  size_t data_len_{0};
  void *data_ptr_{nullptr};
  std::vector<int64_t> shapes_;
  void *device_addr_{nullptr};
};
}  // namespace device
class MS_EXPORT DataQueueHandler {
  HANDLER_DEFINE(device::BlockQueueStatus_T, OpenDynamicBufQueue, const std::string &,
                 const std::function<void(void *, int32_t)>);
  HANDLER_DEFINE(device::BlockQueueStatus_T, Open, const std::string &, const std::function<void(void *, int32_t)>);
  HANDLER_DEFINE(bool, IsClosed);
  HANDLER_DEFINE(void, CloseConfirm);
  HANDLER_DEFINE(void, Close, const std::string &);
  HANDLER_DEFINE(device::BlockQueueStatus_T, Clear, const std::string &);
  HANDLER_DEFINE(device::BlockQueueStatus_T, Push, const std::string &, const std::vector<device::DataQueueItem> &,
                 unsigned int);
};
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_INCLUDE_COMMON_UTILS_DATA_QUEUE_HANDLER_H_
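The DataQueueItem struct in the removed header above carries one tensor of a batch across the host/device boundary. A minimal sketch of populating one (field values are illustrative; the helper name is not a MindSpore API):

#include <vector>

mindspore::device::DataQueueItem MakeItemSketch(void *host_ptr, size_t nbytes) {
  mindspore::device::DataQueueItem item;
  item.worker_id_ = 0;
  item.data_type_ = "float32";       // type name carried as a string, per the struct above
  item.data_len_ = nbytes;           // payload size in bytes
  item.data_ptr_ = host_ptr;         // host-side source buffer
  item.shapes_ = {32, 224, 224, 3};  // tensor shape of this item
  item.device_addr_ = nullptr;       // bound later, once device memory is assigned
  return item;
}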
@ -39,7 +39,6 @@ constexpr auto kSplitLine = "\n-------------------------------------------------
#if defined(__ANDROID__) || defined(ANDROID)
constexpr const char *ANDROID_LOG_TAG = "MS_LITE";
#endif
std::map<void **, std::thread *> acl_handle_map;
// set default log level to WARNING for all sub modules
int g_ms_submodule_log_levels[NUM_SUBMODUES] = {WARNING};
#if defined(_WIN32) || defined(_WIN64)
@ -47,7 +47,6 @@ static constexpr size_t GetRelPathPos() noexcept {
}
namespace mindspore {
/// \brief The handler map for ACL.
MS_CORE_API extern std::map<void **, std::thread *> acl_handle_map;
#define FILE_NAME \
  (sizeof(__FILE__) > GetRelPathPos() ? static_cast<const char *>(__FILE__) + GetRelPathPos() \
                                      : static_cast<const char *>(__FILE__))
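The FILE_NAME macro above trims the build-tree prefix from __FILE__ at compile time so log lines show repository-relative paths. A standalone sketch of the same trick, with the prefix length stood in by a constant (GetRelPathPos() itself is not reproduced here; the value 22 is an arbitrary assumption):

#include <cstddef>
#include <cstdio>

// Illustrative only: pretend the absolute prefix before the repo-relative path is 22 characters.
static constexpr std::size_t kRelPathPos = 22;
#define SHORT_FILE_SKETCH \
  (sizeof(__FILE__) > kRelPathPos ? static_cast<const char *>(__FILE__) + kRelPathPos : static_cast<const char *>(__FILE__))

int main() {
  std::printf("%s\n", SHORT_FILE_SKETCH);  // drops the first 22 characters when the path is long enough
  return 0;
}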
@ -53,7 +53,7 @@ TEST_F(TestCompileSegmentRunner, test_MsVmConvert1) {
  std::shared_ptr<mindspore::FuncGraphManager> manager = mindspore::Manage(g);

  BackendPtr b = std::make_shared<Backend>("vm");
  auto graph_partition = std::make_shared<GraphPartition>(nonlinear_ops, b->name());
  auto graph_partition = std::make_shared<GraphPartition>(GetNonlinearOps(), b->name());
  auto segments = graph_partition->Partition(g);
  VectorRef args({1.0, 2.0});

@ -67,7 +67,7 @@ TEST_F(TestCompileSegmentRunner, test_MsVmConvert2) {
  std::shared_ptr<mindspore::FuncGraphManager> manager = mindspore::Manage(g);

  BackendPtr b = std::make_shared<Backend>("vm");
  auto graph_partition = std::make_shared<GraphPartition>(nonlinear_ops, b->name());
  auto graph_partition = std::make_shared<GraphPartition>(GetNonlinearOps(), b->name());
  auto segments = graph_partition->Partition(g);
  VectorRef args({1.0, 2.0});

@ -81,7 +81,7 @@ TEST_F(TestCompileSegmentRunner, test_if) {
  std::shared_ptr<mindspore::FuncGraphManager> manager = mindspore::Manage(g);

  BackendPtr b = std::make_shared<Backend>("vm");
  auto graph_partition = std::make_shared<GraphPartition>(nonlinear_ops, b->name());
  auto graph_partition = std::make_shared<GraphPartition>(GetNonlinearOps(), b->name());
  auto segments = graph_partition->Partition(g);
  VectorRef args({1.0, 2.0});