forked from mindspore-Ecosystem/mindspore

commit 858924a528: !15323 sub graph split support gpu

From: @ling_qiao_min
Reviewed-by: @chenzupeng, @zhanghaibo5, @zhang_xue_tong
Signed-off-by: @zhang_xue_tong
@@ -35,7 +35,7 @@ option(ENABLE_VERBOSE "" off)
 option(ENABLE_SSE "if x86_64 support SSE instruction set" off)
 option(ENABLE_AVX "if x86_64 support SSE instruction set" off)
 option(ENABLE_MINDRT "if support mindrt" on)
-option(SUBGRAPH_SPLIT "if support sub graph split" off)
+option(SUBGRAPH_SPLIT "if support sub graph split" on)

 set(DIR_PREFIX mindspore-lite)
 set(MS_VERSION ${MS_VERSION_MAJOR}.${MS_VERSION_MINOR}.${MS_VERSION_REVISION})

@@ -133,7 +133,6 @@ set(LITE_SRC
     ${LITE_DIR}/src/common/tensor_util.cc
     ${LITE_DIR}/src/runtime/infer_manager.cc
     ${LITE_DIR}/src/lite_model.cc
-    ${LITE_DIR}/src/sub_graph_split.cc
     ${LITE_DIR}/src/tensorlist.cc
     ${LITE_DIR}/src/tensor.cc
     ${LITE_DIR}/src/weight_decoder.cc

@@ -33,15 +33,7 @@
 #include "include/context.h"

 namespace mindspore::kernel {
-enum KERNEL_ARCH {
-  kCPU,
-  kGPU,
-  kAPU,
-  kNPU,
-  kALL, /* Support GPU NPU CPU */
-  kKernelArch_MIN = kCPU,
-  kKernelArch_MAX = kALL
-};
+enum KERNEL_ARCH { kCPU, kGPU, kAPU, kNPU, kKernelArch_MIN = kCPU, kKernelArch_MAX = kNPU };

 struct KernelKey {
   KERNEL_ARCH arch;
@@ -54,19 +54,18 @@ void LiteOpActor::AsyncOutput(OpContext<Tensor> *context) {
   return;
 }

+void LiteOpActor::AddResultIndex(size_t index) {
+  results_index_.push_back(index);
+  return;
+}
+
 void LiteOpActor::SetOutputData(OpContext<Tensor> *context) {
-  auto size = context->outputData_->size();
-  MS_ASSERT(size == context->results_->size());
-  for (size_t i = 0; i < size; i++) {
-    auto outputData = context->outputData_->at(i);
-    if (GetAID() == outputData->op_id_) {
-      outputData->data_ = kernel_->out_tensors()[outputData->index_];
-      context->SetResult(i, RET_OK);
-    }
+  for (auto index : results_index_) {
+    context->SetResult(index, RET_OK);
   }
 }

-int MindrtInit() { return mindspore::Initialize("tcp://127.0.0.1:8080", "", "", "", 1); }
+int MindrtInit() { return mindspore::Initialize("tcp://127.0.0.1:8080", "", "", "", 2); }

 void MindrtTerminate(std::vector<std::shared_ptr<LiteOpActor>> actor_list) {
   for (auto actor : actor_list) {
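Why this change helps: instead of scanning every entry of outputData_ and comparing actor IDs on every run, the actor now records once, at prepare time, which output slots belong to it (AddResultIndex) and simply marks those slots in SetOutputData. The stand-alone sketch below mirrors that bookkeeping with plain standard-library types; FakeContext and FakeActor are illustrative stand-ins, not MindSpore classes.

#include <cstddef>
#include <iostream>
#include <vector>

// Stand-in for the per-graph output table and its result flags.
struct FakeContext {
  std::vector<int> results;  // one slot per graph output
  void SetResult(size_t index, int code) { results[index] = code; }
};

// Stand-in for LiteOpActor: it only remembers which output slots it owns.
class FakeActor {
 public:
  void AddResultIndex(size_t index) { results_index_.push_back(index); }
  void SetOutputData(FakeContext *ctx) {
    for (auto index : results_index_) {
      ctx->SetResult(index, 0);  // 0 stands in for RET_OK
    }
  }

 private:
  std::vector<size_t> results_index_;
};

int main() {
  FakeContext ctx;
  ctx.results.assign(3, -1);
  FakeActor actor;
  actor.AddResultIndex(1);   // recorded once when outputs are wired up
  actor.SetOutputData(&ctx); // later runs just flip the recorded slots
  std::cout << ctx.results[1] << std::endl;  // prints 0
  return 0;
}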
@@ -42,6 +42,10 @@ class LiteOpActor : public OpActor<lite::Tensor> {
     if (input_op_datas_[op_uuid].size() < kernel_->in_tensors().size()) {
       return;
     }
+    Context *ctx = const_cast<Context *>(kernel_->context());
+    if (kernel_->desc().arch == kernel::kCPU) {
+      BindThreads(static_cast<lite::InnerContext *>(ctx)->thread_pool_, true, 2);
+    }
     auto ret = RunKernel(*(reinterpret_cast<const KernelCallBack *>(context->kernel_call_back_before_)),
                          *(reinterpret_cast<const KernelCallBack *>(context->kernel_call_back_after_)));
     if (ret != RET_OK) {

@@ -51,6 +55,9 @@ class LiteOpActor : public OpActor<lite::Tensor> {
     }
     input_op_datas_.erase(op_uuid);
     AsyncOutput(context);
+    if (kernel_->desc().arch == kernel::kCPU) {
+      BindThreads(static_cast<lite::InnerContext *>(ctx)->thread_pool_, true, 2);
+    }
     SetOutputData(context);
   }
   void Init() {

@@ -82,11 +89,15 @@ class LiteOpActor : public OpActor<lite::Tensor> {
     return ret;
   }

+ public:
+  void AddResultIndex(size_t index);
+
 private:
  void SetOutputData(OpContext<Tensor> *context);
  void AsyncOutput(OpContext<Tensor> *context);

  kernel::LiteKernel *kernel_;
+  std::vector<size_t> results_index_;
 };

 int MindrtInit();

@@ -51,6 +51,7 @@ int MindrtExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) {
     for (size_t j = 0; j < outTensorSize; j++) {
       auto data =
         std::make_shared<OpData<Tensor>>(opActors_[i]->GetAID(), kernels[i]->out_tensors()[j], static_cast<int>(j));
+      opActors_[i]->AddResultIndex(outputData_.size());
       outputData_.emplace_back(data);
     }
   }
@@ -156,26 +156,26 @@ int OpenCLAllocator::GetImgDtypeSize(const ImageSize &img_size) {

 void *OpenCLAllocator::_Malloc(MemType mem_type, void *data, size_t size, const ImageSize &img_size) {
   auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
+  auto enable_arm_import_memory = ocl_runtime_->isExtensionEnable(EXT_ARM_IMPORT_MEMORY_HOST);
+  if (mem_type == MemType::SHARED && !enable_arm_import_memory) {
+    mem_type = MemType::BUF;
+  }
   if (mem_type == MemType::IMG) {
     size = GetImgDtypeSize(img_size);
   }

   if (size > ocl_runtime_->GetMaxAllocSize()) {
     MS_LOG(ERROR) << "MallocData out of max_size, size: " << size;
     return nullptr;
   }
   Lock();
   void *host_ptr = MinimumFit(mem_type, size, img_size);
-  if (host_ptr != nullptr && data == nullptr) {
-    UnLock();
-    return host_ptr;
-  }
+  UNLOCK_AND_RETURN_NULL(host_ptr != nullptr && data == nullptr, host_ptr);
   total_size_ += size;
   const uint64_t max_size = ocl_runtime_->GetGlobalMemSize() * 0.8;
-  if (total_size_ >= max_size) {
-    UnLock();
-    MS_LOG(ERROR) << "Mem pool out of max_size, total size: " << total_size_ << ", max size: " << max_size;
-    return nullptr;
-  }
+  UNLOCK_AND_RETURN_NULL(total_size_ >= max_size, nullptr);
   cl::Buffer *buffer = nullptr;
   cl::Image2D *image = nullptr;
   cl_mem_flags flags = CL_MEM_READ_WRITE;

@@ -184,27 +184,32 @@ void *OpenCLAllocator::_Malloc(MemType mem_type, void *data, size_t size, const
     flags |= (svm_capabilities & CL_DEVICE_SVM_ATOMICS) ? CL_MEM_SVM_ATOMICS : 0;
     host_ptr = clSVMAlloc((*ocl_runtime_->Context())(), flags, size, 0);
   } else {
-    flags |= (data == nullptr) ? CL_MEM_ALLOC_HOST_PTR : CL_MEM_COPY_HOST_PTR;
-    if (mem_type == MemType::BUF || data == nullptr) {
-      host_ptr = CreateBuffer(size, data, flags, &buffer);
-      if (host_ptr == nullptr) {
-        UnLock();
-        return nullptr;
+    if (mem_type == MemType::SHARED) {
+      size = UP_ROUND(size, ocl_runtime_->GetCacheLineSize());
+      host_ptr = malloc(size);
+      UNLOCK_AND_RETURN_NULL(host_ptr == nullptr, nullptr);
+      buffer = ocl_runtime_->CreateSharedMemoryBuffer(size, host_ptr);
+    } else {
+      flags |= (data == nullptr) ? CL_MEM_ALLOC_HOST_PTR : CL_MEM_COPY_HOST_PTR;
+      if (mem_type == MemType::BUF || data == nullptr) {
+        host_ptr = CreateBuffer(size, data, flags, &buffer);
+        UNLOCK_AND_RETURN_NULL(host_ptr == nullptr, nullptr);
       }
-    }
-    if (mem_type == MemType::IMG) {
-      void *host_ptr_im = CreateImage2D(size, img_size, data, flags, data != nullptr, &buffer, &image);
-      if (data != nullptr && host_ptr_im == nullptr) {
-        UnLock();
-        return nullptr;
+      if (mem_type == MemType::IMG) {
+        void *host_ptr_im = CreateImage2D(size, img_size, data, flags, data != nullptr, &buffer, &image);
+        UNLOCK_AND_RETURN_NULL(data != nullptr && host_ptr_im == nullptr, nullptr);
+        host_ptr = (data != nullptr) ? host_ptr_im : host_ptr;
       }
-      host_ptr = (data != nullptr) ? host_ptr_im : host_ptr;
     }
   }
   MemBuf *mem_buf = new (std::nothrow) MemBuf;
   if (mem_buf == nullptr) {
     delete buffer;
     delete image;
+    if (mem_type == MemType::SHARED) {
+      free(host_ptr);
+    }
     UnLock();
     return nullptr;
   }
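For SHARED allocations the new branch rounds the requested size up to the device cache-line size, mallocs ordinary host memory, and then wraps it through CreateSharedMemoryBuffer. The small program below only illustrates the round-up step; UpRound is a local stand-in written under the assumption that UP_ROUND means "round x up to a multiple of align", and the cache-line value is a placeholder rather than a queried device property.

#include <cstddef>
#include <cstdlib>
#include <iostream>

// Local stand-in for the UP_ROUND helper used in _Malloc.
static size_t UpRound(size_t x, size_t align) { return (x + align - 1) / align * align; }

int main() {
  const size_t cache_line_size = 64;  // placeholder for ocl_runtime_->GetCacheLineSize()
  size_t size = 1000;
  size = UpRound(size, cache_line_size);
  void *host_ptr = malloc(size);      // this pointer would then be imported as an OpenCL buffer
  std::cout << size << std::endl;     // 1024
  free(host_ptr);
  return 0;
}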
@@ -216,7 +221,9 @@ void *OpenCLAllocator::_Malloc(MemType mem_type, void *data, size_t size, const
   mem_buf->img_size_ = img_size;
   allocated_list_[host_ptr] = mem_buf;
   UnLock();
-  std::string type_name = mem_type == MemType::BUF ? "buffer" : "Image2D";
+  std::string type_name = (mem_type == MemType::BUF) ? "buffer" : "Image2D";
+  type_name = (mem_type == MemType::SHARED) ? "shared" : type_name;
+
   MS_LOG(DEBUG) << "Malloc a new " << type_name << ". size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_
                 << ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image
                 << ", total size: " << total_size_;

@@ -306,6 +313,10 @@ void OpenCLAllocator::ClearMemList(T *list) {
       delete image;
       it->second->image_ptr_ = nullptr;
     }
+    if (it->second->mem_type_ == MemType::SHARED) {
+      free(it->second->host_ptr_);
+      it->second->host_ptr_ = nullptr;
+    }
   }
   delete it->second;
 }

@@ -351,12 +362,18 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue,
   }
   MemBuf *mem_buf = it->second;
   MS_ASSERT(mem_buf);
+  if (mem_buf->mem_type_ == MemType::SHARED) {
+    UnLock();
+    MS_LOG(WARNING) << "Host ptr " << host_ptr << " no need map";
+    return host_ptr;
+  }
+
   void *new_host_ptr{nullptr};
   if (mem_buf->mem_type_ == MemType::BUF) {
     cl::Buffer *buffer = static_cast<cl::Buffer *>(mem_buf->device_ptr_);
     MS_ASSERT(buffer);
     new_host_ptr = ocl_runtime_->MapBuffer(*buffer, flags, mem_buf->size_, nullptr, sync);
-  } else {
+  } else if (mem_buf->mem_type_ == MemType::IMG) {
     std::vector<size_t> region{mem_buf->img_size_.width, mem_buf->img_size_.height, 1};
     cl::Image2D *image = static_cast<cl::Image2D *>(mem_buf->image_ptr_);
     MS_ASSERT(image);

@@ -28,9 +28,17 @@
 #include "CL/cl2.hpp"

 namespace mindspore::lite::opencl {
+#define UNLOCK_AND_RETURN_NULL(condition, ptr) \
+  do {                                         \
+    if (condition) {                           \
+      UnLock();                                \
+      return (ptr);                            \
+    }                                          \
+  } while (0)
+
 class OpenCLRuntime;
-enum class MemType : char { BUF, IMG };
+enum class MemType : char { BUF, IMG, SHARED };

 struct ImageSize {
   size_t width = 0;
   size_t height = 0;
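The UNLOCK_AND_RETURN_NULL macro introduced here folds the repeated "if (condition) { UnLock(); return ...; }" sequences of _Malloc into a single statement, so every early exit releases the allocator lock exactly once. A reduced, compilable sketch of the same pattern follows; DemoAllocator and its Lock/UnLock bodies are invented for illustration, only the macro itself comes from the diff.

#include <cstddef>
#include <iostream>

#define UNLOCK_AND_RETURN_NULL(condition, ptr) \
  do {                                         \
    if (condition) {                           \
      UnLock();                                \
      return (ptr);                            \
    }                                          \
  } while (0)

class DemoAllocator {
 public:
  void *Alloc(size_t size) {
    Lock();
    // Bail out early while still releasing the lock exactly once.
    UNLOCK_AND_RETURN_NULL(size == 0, nullptr);
    UNLOCK_AND_RETURN_NULL(size > kMaxSize, nullptr);
    void *ptr = new char[size];
    UnLock();
    return ptr;
  }

 private:
  static constexpr size_t kMaxSize = 1 << 20;
  void Lock() { std::cout << "lock" << std::endl; }
  void UnLock() { std::cout << "unlock" << std::endl; }
};

int main() {
  DemoAllocator alloc;
  void *p = alloc.Alloc(0);               // early return, lock still released
  std::cout << (p == nullptr) << std::endl;  // prints 1
  return 0;
}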
@@ -45,8 +53,11 @@ class OpenCLAllocator : public mindspore::Allocator {
   explicit OpenCLAllocator(OpenCLRuntime *ocl_runtime);
   ~OpenCLAllocator() override;
   void SetContext(const AllocatorContext &ctx) override;
+  void *Malloc(size_t size, MemType type) { return _Malloc(type, nullptr, size); }
+
+  // malloc shared
+  void *Malloc(size_t size) override { return _Malloc(MemType::SHARED, nullptr, size); }
   // malloc buffer
-  void *Malloc(size_t size) override { return _Malloc(MemType::BUF, nullptr, size); }
   void *Malloc(size_t size, void *data) { return _Malloc(MemType::BUF, data, size); }
   // malloc image
   void *Malloc(const ImageSize &img_size, void *data = nullptr) { return _Malloc(MemType::IMG, data, 0, img_size); }

@@ -167,6 +167,8 @@ int OpenCLRuntime::InitGPUDevice(std::vector<cl::Platform> *platforms) {
   max_alloc_size_ = device_->getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
   max_image2d_width_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
   max_image2d_height_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+  supported_extensions_ = std::string(device_->getInfo<CL_DEVICE_EXTENSIONS>());
+  cache_line_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
   MS_LOG(INFO) << "Address space bits: " << device_->getInfo<CL_DEVICE_ADDRESS_BITS>();
   MS_LOG(INFO) << "Global Mem Size: " << global_memery_size_;
   MS_LOG(INFO) << "Global Mem Cache Size: " << global_memery_cachesize_;

@@ -757,4 +759,19 @@ void OpenCLRuntime::StoreCache() {
   MS_LOG(INFO) << "store opencl cache ok, size=" << fbb->GetSize();
 }

+cl::Buffer *OpenCLRuntime::CreateSharedMemoryBuffer(size_t size, void *host_ptr) {
+  cl_int error = CL_SUCCESS;
+  cl_mem cl_buffer = clImportMemoryARM(context_->get(), CL_MEM_READ_WRITE, NULL, host_ptr, size, &error);
+  if (error != CL_SUCCESS) {
+    MS_LOG(ERROR) << "Create OpenCL shared memory failed for" << CLErrorCode(error);
+    return nullptr;
+  }
+  cl::Buffer *buffer = new (std::nothrow) cl::Buffer(cl_buffer, false);
+  if (buffer == nullptr) {
+    MS_LOG(ERROR) << "New OpenCL Buffer failed";
+    return nullptr;
+  }
+  return buffer;
+}
+
 } // namespace mindspore::lite::opencl
@@ -28,6 +28,7 @@ * you may not use this file except in compliance with the License.
 #include "src/runtime/gpu/opencl/opencl_wrapper.h"
 #include "src/runtime/gpu/opencl/opencl_allocator.h"
 #include "schema/gpu_cache_generated.h"
+#define EXT_ARM_IMPORT_MEMORY_HOST "cl_arm_import_memory_host"

 namespace mindspore::lite::opencl {

@@ -151,6 +152,9 @@ class OpenCLRuntime {

   bool isProfiling() const { return profiling_; }
   void SetProfiling(bool profiling) { profiling_ = profiling; }
+  bool isExtensionEnable(std::string ext) { return supported_extensions_.find(ext) != std::string::npos; }
+  cl::Buffer *CreateSharedMemoryBuffer(size_t size, void *host_ptr);
+  uint GetCacheLineSize() const { return cache_line_size_; }

 private:
  static OpenCLRuntime *GetInstance();

@@ -196,6 +200,8 @@ class OpenCLRuntime {
   bool profiling_{true};
 #else
   bool profiling_{false};
+  std::string supported_extensions_{""};
+  uint cache_line_size_{1};
 #endif
   // for cache
  private:
@@ -102,6 +102,7 @@ bool LoadLibraryFromPath(const std::string &library_path, void **handle_ptr) {
   LOAD_OPENCL_FUNCTION_PTR(clCreateProgramWithSource);
   LOAD_OPENCL_FUNCTION_PTR(clCreateBuffer);
   LOAD_OPENCL_FUNCTION_PTR(clCreateImage2D);
+  LOAD_OPENCL_FUNCTION_PTR(clImportMemoryARM);
   LOAD_OPENCL_FUNCTION_PTR(clCreateImage3D);
   LOAD_OPENCL_FUNCTION_PTR(clRetainKernel);
   LOAD_OPENCL_FUNCTION_PTR(clCreateKernel);

@@ -192,6 +193,7 @@ CL_DEFINE_FUNC_PTR(clReleaseKernel);
 CL_DEFINE_FUNC_PTR(clCreateProgramWithSource);
 CL_DEFINE_FUNC_PTR(clCreateBuffer);
 CL_DEFINE_FUNC_PTR(clCreateImage2D);
+CL_DEFINE_FUNC_PTR(clImportMemoryARM);
 CL_DEFINE_FUNC_PTR(clCreateImage3D);
 CL_DEFINE_FUNC_PTR(clRetainKernel);
 CL_DEFINE_FUNC_PTR(clCreateKernel);

@@ -90,6 +90,7 @@ using clRetainKernelFunc = cl_int (*)(cl_kernel kernel);
 using clCreateBufferFunc = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
 using clCreateImage2DFunc = cl_mem (*)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t,
                                        void *, cl_int *);
+using clImportMemoryARMFunc = cl_mem (*)(cl_context, cl_mem_flags, const cl_image_format *, void *, ssize_t, cl_int *);
 using clCreateImage3DFunc = cl_mem (*)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t,
                                        size_t, size_t, void *, cl_int *);
 using clCreateProgramWithSourceFunc = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *);

@@ -143,6 +144,7 @@ CL_DECLARE_FUNC_PTR(clReleaseKernel);
 CL_DECLARE_FUNC_PTR(clCreateProgramWithSource);
 CL_DECLARE_FUNC_PTR(clCreateBuffer);
 CL_DECLARE_FUNC_PTR(clCreateImage2D);
+CL_DECLARE_FUNC_PTR(clImportMemoryARM);
 CL_DECLARE_FUNC_PTR(clCreateImage3D);
 CL_DECLARE_FUNC_PTR(clRetainKernel);
 CL_DECLARE_FUNC_PTR(clCreateKernel);
@@ -70,6 +70,7 @@ void GroupConvolutionBaseCPUKernel::FreeSubKernel() {
     delete sub_conv;
     sub_conv = nullptr;
   }
+  group_convs_.clear();
 }

 int GroupConvolutionBaseCPUKernel::PreProcess() {

@@ -113,6 +113,7 @@ void GroupConvCreator::FreeGroupConvs() {
     }
     delete sub_conv;
   }
+  group_convs_.clear();
 }

 int GroupConvCreator::NewInputTensor(std::vector<lite::Tensor *> *tensors) {

@@ -133,8 +133,8 @@ void ArgMinMaxOpenCLKernel::SetGlobalLocal() {
 int ArgMinMaxOpenCLKernel::InitWeights() {
   auto allocator = ocl_runtime_->GetAllocator();
   int dtype_size = ocl_runtime_->GetFp16Enable() ? sizeof(int16_t) : sizeof(float);
-  buff_ = allocator->Malloc(in_tensors_[0]->ElementsNum() * dtype_size);
-  ids_ = allocator->Malloc(in_tensors_[0]->ElementsNum() * sizeof(int32_t));
+  buff_ = allocator->Malloc(in_tensors_[0]->ElementsNum() * dtype_size, lite::opencl::MemType::BUF);
+  ids_ = allocator->Malloc(in_tensors_[0]->ElementsNum() * sizeof(int32_t), lite::opencl::MemType::BUF);
   return RET_OK;
 }

@@ -90,10 +90,10 @@ int BatchNormOpenCLKernel::Initweight() {
   auto weight_tensor = in_tensors_.at(1);
   size_t weight_size = img_info.OriginSize;
   // allocated memory for weight and init value
-  scale_ = allocator->Malloc(weight_size);
-  offset_ = allocator->Malloc(weight_size);
-  mean_ = allocator->Malloc(weight_size);
-  variance_ = allocator->Malloc(weight_size);
+  scale_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
+  offset_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
+  mean_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
+  variance_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);

   allocator->MapBuffer(scale_, CL_MAP_WRITE, nullptr, true);
   allocator->MapBuffer(offset_, CL_MAP_WRITE, nullptr, true);

@@ -254,7 +254,7 @@ void Conv2DOpenCLKernel::InitFilter() {
     packed_filter_ = allocator->Malloc({width, height, dtype});
   } else {
     size = UP_DIV(CO_SLICES_, Ogroup) * KH_ * KW_ * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_;
-    packed_filter_ = allocator->Malloc(size);
+    packed_filter_ = allocator->Malloc(size, lite::opencl::MemType::BUF);
   }

   // rearrange filter

@@ -287,7 +287,7 @@ void Conv2DOpenCLKernel::InitBias() {
   // align bias from C to C4
   auto bias_tensor = in_tensors_.at(2);
   size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_;
-  packed_bias_ = allocator->Malloc(packed_bias_size);
+  packed_bias_ = allocator->Malloc(packed_bias_size, lite::opencl::MemType::BUF);

   allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true);
   memset(packed_bias_, 0x00, packed_bias_size);

@@ -144,7 +144,7 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {

   // IHWO to OHWI4(I)4(O)(converter format is IHWO)
   // init padWeight_(buffer mem)
-  padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size);
+  padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size, lite::opencl::MemType::BUF);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
   memset(padWeight_, 0x00, div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size);
   auto origin_weight = in_tensors_.at(kWeightIndex)->data_c();
@@ -133,7 +133,8 @@ int FullConnectionOpenCLKernel::InitFilter() {
   int co4 = UP_DIV(CO_, C4NUM);
   int nhw_remainder = intensor_shape.N * intensor_shape.H * intensor_shape.W / N_;
   size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
-  padWeight_ = allocator->Malloc(nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size);
+  padWeight_ = allocator->Malloc(nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size,
+                                 lite::opencl::MemType::BUF);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);

@@ -183,7 +183,7 @@ int FusionEltwiseOpenCLKernel::InitWeights() {
       auto tensor_info = GpuTensorInfo(tensor);
       size_t num = tensor_info.ElementsNum;
       size_t size = tensor_info.Image2DSize;
-      void *buffer = allocator->Malloc(size);
+      void *buffer = allocator->Malloc(size, lite::opencl::MemType::BUF);
       allocator->MapBuffer(buffer, CL_MAP_WRITE, nullptr, true);
       memset(buffer, 0x00, size);
       if (tensor->data_type() == kNumberTypeFloat16) {

@@ -129,7 +129,8 @@ int GatherOpenCLKernel::ConvertTensorToweight() {
   auto allocator = ocl_runtime_->GetAllocator();
   auto indices_tensor = in_tensors_.at(1);
   auto indices_num = indices_tensor->ElementsNum();
-  indices_data_ = reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num));
+  indices_data_ =
+    reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num), lite::opencl::MemType::BUF);
   allocator->MapBuffer(indices_data_, CL_MAP_WRITE, nullptr, true);
   if (indices_data_ == nullptr) {
     MS_LOG(ERROR) << "Memory allocation failed";

@@ -154,7 +155,8 @@ int GatherOpenCLKernel::InitWeights() {
   auto indices_tensor = in_tensors_.at(1);
   auto indices_num = indices_tensor->ElementsNum();
   auto allocator = ocl_runtime_->GetAllocator();
-  indices_data_ = reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num));
+  indices_data_ =
+    reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num), lite::opencl::MemType::BUF);
   if (indices_data_ == nullptr) {
     MS_LOG(ERROR) << "Memory allocation failed";
     return RET_ERROR;
@@ -106,8 +106,8 @@ int LayerNormOpenCLKernel::Initweight() {
   auto weight_tensor = in_tensors_.at(1);
   size_t weight_size = img_info.Image2DSize;
   // allocated memory for weight and init value
-  gamma_ = allocator->Malloc(weight_size);
-  beta_ = allocator->Malloc(weight_size);
+  gamma_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
+  beta_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
   allocator->MapBuffer(gamma_, CL_MAP_WRITE, nullptr, true);
   allocator->MapBuffer(beta_, CL_MAP_WRITE, nullptr, true);
   memset(gamma_, 0x01, weight_size);

@@ -164,8 +164,8 @@ int LayerNormOpenCLKernel::Prepare() {
   }
   size_t size_dtype = use_fp16_enable_ ? sizeof(float16_t) : sizeof(float);
   mean_size *= size_dtype;
-  mean_ = allocator->Malloc(mean_size);
-  var_ = allocator->Malloc(mean_size);
+  mean_ = allocator->Malloc(mean_size, lite::opencl::MemType::BUF);
+  var_ = allocator->Malloc(mean_size, lite::opencl::MemType::BUF);
   std::string kernel_name = "LayerNormalization_NHWC4";
   std::string kernel_name_mean_var = "ComputeMeanVar";
   std::string source = layer_norm_source;

@@ -130,7 +130,7 @@ int MatMulOpenCLKernel::InitWeights() {
   int b = weight_shape_4d[1];

   size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
-  padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
+  padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size, lite::opencl::MemType::BUF);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
@@ -46,7 +46,7 @@ int PReluOpenCLKernel::InitWeights() {
   int C_ = weight_tensor->ElementsNum();
   auto sizeof_FLT = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
   size_t weight_size = UP_ROUND(C_, C4NUM) * sizeof_FLT;
-  weight_vector_ = allocator->Malloc(weight_size);
+  weight_vector_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
   allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true);
   memset(weight_vector_, 0x00, weight_size);
   if (weight_tensor->data_type() == kNumberTypeFloat16) {

@@ -62,7 +62,7 @@ int SparseToDenseOpenCLKernel::InitWeights() {
   } else {
     auto sizeof_FLT = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
     size_t weight_size = UP_ROUND(size, C4NUM) * sizeof_FLT;
-    weight_vector_ = allocator->Malloc(weight_size);
+    weight_vector_ = allocator->Malloc(weight_size, lite::opencl::MemType::BUF);
     allocator->MapBuffer(weight_vector_, CL_MAP_WRITE, nullptr, true);
     memset(weight_vector_, 0x00, weight_size);
     if (weight_tensor->data_type() == kNumberTypeFloat16) {

@@ -94,13 +94,13 @@ void SplitOpenCLKernel::AlignSplitSizes(SplitParameter *param, const std::vector
   int shape_dim = in_shape.at(param->split_dim_);
   if (num_split_ == 1) {
     size_t num_split = UP_DIV(shape_dim, param->split_sizes_[0]);
-    split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split * sizeof(int)));
+    split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split * sizeof(int), lite::opencl::MemType::BUF));
     for (int i = 0; i < num_split - 1; ++i) {
       split_sizes_[i] = (i + 1) * param->split_sizes_[0];
     }
   } else {
     int sum = 0;
-    split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split_ * sizeof(int)));
+    split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split_ * sizeof(int), lite::opencl::MemType::BUF));
     for (int i = 0; i < num_split_ - 1; ++i) {
       sum += param->split_sizes_[i];
       split_sizes_[i] = sum;
@@ -55,7 +55,7 @@ void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
   size_t dtype_size = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
   size_t memB = NumB * NumB * dtype_size;
   for (int depth = 0; depth < MAXDEPTH; depth++) {
-    B_temp[depth] = allocator->Malloc(memB);
+    B_temp[depth] = allocator->Malloc(memB, lite::opencl::MemType::BUF);
     A_temp[depth] = allocator->Malloc(img_size);
     M1[depth] = allocator->Malloc(img_size);
     M2[depth] = allocator->Malloc(img_size);

@@ -73,7 +73,7 @@ int StrassenOpenCLKernel::InitWeights() {
   int NumA = in_tensors_[0]->shape()[0];
   int NumB = in_tensors_[1]->shape()[0];
   size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
-  padWeight_ = allocator->Malloc(NumA * NumB * dtype_size);
+  padWeight_ = allocator->Malloc(NumA * NumB * dtype_size, lite::opencl::MemType::BUF);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);

@@ -102,7 +102,7 @@ void WinogradOpenCLKernel::InitFilter() {
     packed_filter_ = allocator->Malloc({width, height, dtype});
   } else {
     size = UP_DIV(CO_SLICES_, Ogroup) * 6 * 6 * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_;
-    packed_filter_ = allocator->Malloc(size);
+    packed_filter_ = allocator->Malloc(size, MemType::BUF);
   }

   // rearrange filter
@@ -74,7 +74,7 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
   this->graph_output_node_indexes_ = GetGraphOutputNodes(src_model_);

 #ifdef SUBGRAPH_SPLIT
-  auto search_sub_graph = SearchSubGraph(src_model_, this->graph_output_node_indexes_);
+  auto search_sub_graph = SearchSubGraph(context_, src_model_, this->graph_output_node_indexes_);
   search_sub_graph.SubGraphSplitByOutput();
 #endif

@@ -357,6 +357,7 @@ kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_ten
                                              const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter,
                                              const kernel::KernelKey &desc) {
   MS_ASSERT(op_parameter != nullptr);
+
   if (context_->IsGpuEnabled()) {
     // support more data type like int32
     kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type};

@@ -433,6 +434,7 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in
   kernel::KernelKey desc{kCPU, data_type, static_cast<schema::PrimitiveType>(op_parameter->type_)};
   kernel::LiteKernel *kernel = nullptr;
 #ifdef SUPPORT_GPU
+  // if (node->device_type_ == DT_GPU || node->device_type_ == DEFAULT) {
   kernel = FindGpuKernel(in_tensors, out_tensors, op_parameter, desc);
   if (kernel != nullptr) {
     return kernel;

@@ -447,8 +449,10 @@
       return nullptr;
     }
   }
+  // }
 #endif
 #ifdef SUPPORT_NPU
+  // if (node->device_type_ == DT_NPU || node->device_type_ == DEFAULT) {
   kernel = FindNpuKernel(in_tensors, out_tensors, op_parameter, desc);
   if (kernel != nullptr) {
     return kernel;

@@ -463,6 +467,7 @@
       return nullptr;
     }
   }
+  // }
 #endif
   if (prefer_data_type == kNumberTypeFloat16 || prefer_data_type == kTypeUnknown) {
     kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat16);
@@ -617,34 +622,37 @@ bool Scheduler::KernelFitCurrentSubGraph(const kernel::SubGraphType subgraph_typ
 }

 std::vector<kernel::LiteKernel *> Scheduler::FindAllSubGraphKernels(
-  kernel::LiteKernel *head_kernel, std::map<const kernel::LiteKernel *, bool> *sinked_kernel_map) {
-  MS_ASSERT(head_kernel != nullptr);
-  MS_ASSERT(sinked_kernel_map != nullptr);
+  std::vector<kernel::LiteKernel *> head_kernels, std::map<const kernel::LiteKernel *, bool> *sinked_kernel_map) {
   std::vector<kernel::LiteKernel *> sub_kernels;
-  if (head_kernel->Type() == schema::PrimitiveType_Switch || head_kernel->Type() == schema::PrimitiveType_Merge) {
-    (*sinked_kernel_map)[head_kernel] = true;
-    sub_kernels.emplace_back(head_kernel);
-    return sub_kernels;
-  }
-  std::queue<kernel::LiteKernel *> kernel_queue;
-  kernel_queue.emplace(head_kernel);
-  auto cur_sub_graph_type = mindspore::lite::Scheduler::GetKernelSubGraphType(head_kernel);
-  while (!kernel_queue.empty()) {
-    auto cur_kernel = kernel_queue.front();
-    kernel_queue.pop();
-    (*sinked_kernel_map)[cur_kernel] = true;
-    sub_kernels.emplace_back(cur_kernel);
-    auto post_kernels = cur_kernel->out_kernels();
-    for (auto post_kernel : post_kernels) {
-      if (post_kernel->subgraph_type() != kernel::kNotSubGraph || post_kernel->Type() == schema::PrimitiveType_Merge ||
-          post_kernel->Type() == schema::PrimitiveType_Switch) {
-        continue;
-      }
-      if (cur_sub_graph_type == mindspore::lite::Scheduler::GetKernelSubGraphType(post_kernel)) {
-        auto post_kernel_inputs = post_kernel->in_kernels();
-        if (std::all_of(post_kernel_inputs.begin(), post_kernel_inputs.end(),
-                        [&](kernel::LiteKernel *kernel) { return (*sinked_kernel_map)[kernel]; })) {
-          kernel_queue.emplace(post_kernel);
+  for (kernel::LiteKernel *head_kernel : head_kernels) {
+    MS_ASSERT(head_kernel != nullptr);
+    MS_ASSERT(sinked_kernel_map != nullptr);
+    if (head_kernel->Type() == schema::PrimitiveType_Switch || head_kernel->Type() == schema::PrimitiveType_Merge) {
+      (*sinked_kernel_map)[head_kernel] = true;
+      sub_kernels.emplace_back(head_kernel);
+      return sub_kernels;
+    }
+    std::queue<kernel::LiteKernel *> kernel_queue;
+    kernel_queue.emplace(head_kernel);
+    auto cur_sub_graph_type = mindspore::lite::Scheduler::GetKernelSubGraphType(head_kernel);
+    while (!kernel_queue.empty()) {
+      auto cur_kernel = kernel_queue.front();
+      kernel_queue.pop();
+      (*sinked_kernel_map)[cur_kernel] = true;
+      sub_kernels.emplace_back(cur_kernel);
+      auto post_kernels = cur_kernel->out_kernels();
+      for (auto post_kernel : post_kernels) {
+        if (post_kernel->subgraph_type() != kernel::kNotSubGraph ||
+            post_kernel->Type() == schema::PrimitiveType_Merge || post_kernel->Type() == schema::PrimitiveType_Switch) {
+          continue;
+        }
+        if (cur_sub_graph_type == mindspore::lite::Scheduler::GetKernelSubGraphType(post_kernel)) {
+          auto post_kernel_inputs = post_kernel->in_kernels();
+          if (std::all_of(post_kernel_inputs.begin(), post_kernel_inputs.end(),
+                          [&](kernel::LiteKernel *kernel) { return (*sinked_kernel_map)[kernel]; })) {
+            kernel_queue.emplace(post_kernel);
+          }
         }
       }
     }
   }
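FindAllSubGraphKernels now grows a subgraph from a set of head kernels rather than a single one: each head seeds a breadth-first walk, and a successor is pulled in only when it has the same subgraph type and all of its producers are already sinked. The toy program below reproduces just that frontier rule over a hand-built graph; Node and CollectSubGraph are illustrative names, the Merge/Switch special-casing is deliberately left out, and an explicit visited check is added to keep the toy example simple.

#include <iostream>
#include <map>
#include <queue>
#include <vector>

struct Node {
  int type;                      // stand-in for the kernel's subgraph type
  std::vector<Node *> inputs;
  std::vector<Node *> outputs;
};

// Collect all nodes reachable from `heads` that share their subgraph type and
// whose inputs have all been visited, mirroring the frontier rule above.
std::vector<Node *> CollectSubGraph(const std::vector<Node *> &heads, std::map<Node *, bool> *sinked) {
  std::vector<Node *> result;
  for (Node *head : heads) {
    std::queue<Node *> frontier;
    frontier.push(head);
    int sub_type = head->type;
    while (!frontier.empty()) {
      Node *cur = frontier.front();
      frontier.pop();
      (*sinked)[cur] = true;
      result.push_back(cur);
      for (Node *post : cur->outputs) {
        if (post->type != sub_type || (*sinked)[post]) {
          continue;  // different subgraph type, or already collected
        }
        bool ready = true;
        for (Node *in : post->inputs) {
          ready = ready && (*sinked)[in];
        }
        if (ready) {
          frontier.push(post);
        }
      }
    }
  }
  return result;
}

int main() {
  Node a{0, {}, {}}, b{0, {}, {}}, c{1, {}, {}};
  a.outputs = {&b, &c};
  b.inputs = {&a};
  c.inputs = {&a};
  std::map<Node *, bool> sinked;
  auto sub = CollectSubGraph({&a}, &sinked);
  std::cout << sub.size() << std::endl;  // 2: a and b (c has a different type)
  return 0;
}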
@@ -659,11 +667,15 @@ int Scheduler::ConstructSubGraphs(std::vector<kernel::LiteKernel *> src_kernel,
     (*is_kernel_finish)[kernel] = false;
   }
   while (true) {
+    std::vector<kernel::LiteKernel *> head_kernels;
     auto head_kernel_iter = std::find_if(src_kernel.begin(), src_kernel.end(), [&](const kernel::LiteKernel *kernel) {
       auto kernel_inputs = kernel->in_kernels();
       if ((*is_kernel_finish)[kernel]) {
         return false;
       }
+      if (std::find(head_kernels.begin(), head_kernels.end(), kernel) != head_kernels.end()) {
+        return false;
+      }
       // when merge is removed, this if is removed automatically
       if (kernel->Type() == schema::PrimitiveType_Merge) {
         return MergeOpIsReady(kernel, (*is_kernel_finish));

@@ -675,25 +687,33 @@
     if (head_kernel_iter == src_kernel.end()) {
       break;
     }

     auto head_kernel = *head_kernel_iter;
     if (head_kernel->subgraph_type() != kernel::kNotSubGraph) {
       (*is_kernel_finish)[head_kernel] = true;
       dst_kernel->push_back(head_kernel);

+      /* npu support split */
+      /* ConstructSubGraphs(head_kernel->nodes(), dst_kernel, is_kernel_finish); */
       continue;
     }
     if (head_kernel->desc().arch == mindspore::kernel::kAPU) {
       MS_LOG(ERROR) << "Not support APU now";
       return RET_NOT_SUPPORT;
     }
-    auto cur_sub_graph_type = mindspore::lite::Scheduler::GetKernelSubGraphType(head_kernel);
-    auto sub_kernels = FindAllSubGraphKernels(head_kernel, is_kernel_finish);
+    head_kernels.push_back(head_kernel);
+
+    auto cur_sub_graph_type = mindspore::lite::Scheduler::GetKernelSubGraphType(head_kernels[0]);
+    auto sub_kernels = FindAllSubGraphKernels(head_kernels, is_kernel_finish);
     auto subgraph = CreateSubGraphKernel(sub_kernels, nullptr, nullptr, cur_sub_graph_type);
     if (subgraph == nullptr) {
       MS_LOG(ERROR) << "Create SubGraphKernel failed";
       return RET_ERROR;
     }
     dst_kernel->emplace_back(subgraph);
-  }
+  } /* end when all kernel converted */

   for (auto *subgraph : *dst_kernel) {
     auto ret = subgraph->Init();
     if (ret != RET_OK) {

@@ -702,7 +722,7 @@
     }
   }
   return RET_OK;
-}
+} // namespace mindspore::lite

 bool Scheduler::MergeOpIsReady(const kernel::LiteKernel *kernel,
                                std::map<const kernel::LiteKernel *, bool> is_kernel_finish) {
@@ -92,7 +92,7 @@ class Scheduler {
   bool KernelFitCurrentSubGraph(const kernel::SubGraphType subgraph_type, const kernel::LiteKernel &kernel);

   std::vector<kernel::LiteKernel *> FindAllSubGraphKernels(
-    kernel::LiteKernel *head_kernel, std::map<const kernel::LiteKernel *, bool> *sinked_kernel_map);
+    std::vector<kernel::LiteKernel *> head_kernels, std::map<const kernel::LiteKernel *, bool> *sinked_kernel_map);

   // other methods
   static TypeId GetFirstFp32Fp16OrInt8Type(const std::vector<Tensor *> &in_tensors);

@@ -18,11 +18,10 @@
 #include <vector>
 #include <utility>
 #include "src/tensor.h"
-#include "schema/inner/ops_generated.h"
-#include "schema/inner/model_generated.h"
+#include "schema/ops_generated.h"
+#include "schema/model_generated.h"

 namespace mindspore::lite {
-#ifdef SUBGRAPH_SPLIT
 const schema::Primitive *SearchSubGraph::CreatePartialPrimitive(int64_t subgraph_index) {
   flatbuffers::FlatBufferBuilder fbb(1024);
   auto val_offset = schema::CreatePartialFusion(fbb, subgraph_index);

@@ -46,7 +45,7 @@ void SearchSubGraph::ConvertSubGraphToModel() {
     if (subgraph.nodes_.empty()) {
       continue;
     }
-    mindspore::kernel::KERNEL_ARCH device = subgraph.device_;
+    // DeviceType device = subgraph.device_;

     int new_sub_index = model_->sub_graphs_.size();
     int partial_index = model_->all_nodes_.size();

@@ -72,7 +71,7 @@ void SearchSubGraph::ConvertSubGraphToModel() {
       new_sub_graph->node_indices_.push_back(node_index);
       VectorErase(&main_graphs->node_indices_, node_index);
       VectorErase(&subgraph.nodes_, node_index);
-      model_->all_nodes_[node_index]->device_type_ = device;
+      // model_->all_nodes_[node_index]->device_type_ = device;
     }

     for (uint32_t head_index : subgraph.heads_) {

@@ -134,7 +133,7 @@ void SearchSubGraph::InsertNode(uint32_t index, Subgraph *subgraph) {
   /* remove const node */
   for (int i = input.size() - 1; i >= 0; i--) {
     if (tensors_[input[i]].type_ == CONST) {
-      input.erase(input.begin() + i);
+      VectorErase(&input, input[i]);
     }
   }

@@ -223,36 +222,62 @@ void SearchSubGraph::InitSearchTensor() {
 }

 void SearchSubGraph::InitSubgraphDevice() {
-  sub_graphs_[0].device_ = kernel::KERNEL_ARCH::kCPU;
-  sub_graphs_[1].device_ = kernel::KERNEL_ARCH::kALL;
-}
-
-void SearchSubGraph::InitMainGraphDevice() {
-  kernel::KERNEL_ARCH main_device = kernel::KERNEL_ARCH::kALL;
-  Model::SubGraph *main_graph = model_->sub_graphs_.front();
-  for (uint32_t node_index : main_graph->node_indices_) {
-    Model::Node *node = model_->all_nodes_[node_index];
-    node->device_type_ = main_device;
+  for (size_t i = 0; i < sub_graphs_.size(); i++) {
+    sub_graphs_[i].device_ = (i % 2 == 0) ? DT_CPU : DT_GPU;
   }
 }

+void SearchSubGraph::InitMainGraphDevice() {
+  // DeviceType main_device = DT_GPU;
+  // Model::SubGraph *main_graph = model_->sub_graphs_.front();
+  // for (uint32_t node_index : main_graph->node_indices_) {
+  //   Model::Node *node = model_->all_nodes_[node_index];
+  //   node->device_type_ = main_device;
+}
+
 void SearchSubGraph::SubgraphFusion() {
-  Subgraph new_npu_sub;
-  Subgraph &npu_sub1 = sub_graphs_[1];
-  Subgraph &npu_sub2 = sub_graphs_[2];
-  new_npu_sub.nodes_.insert(new_npu_sub.nodes_.end(), npu_sub1.nodes_.begin(), npu_sub1.nodes_.end());
-  new_npu_sub.nodes_.insert(new_npu_sub.nodes_.end(), npu_sub2.nodes_.begin(), npu_sub2.nodes_.end());
-  new_npu_sub.heads_.insert(new_npu_sub.heads_.end(), npu_sub1.heads_.begin(), npu_sub1.heads_.end());
-  new_npu_sub.heads_.insert(new_npu_sub.heads_.end(), npu_sub2.heads_.begin(), npu_sub2.heads_.end());
-  new_npu_sub.ends_.insert(new_npu_sub.ends_.end(), npu_sub1.ends_.begin(), npu_sub1.ends_.end());
-  new_npu_sub.ends_.insert(new_npu_sub.ends_.end(), npu_sub2.ends_.begin(), npu_sub2.ends_.end());
-  sub_graphs_.erase(sub_graphs_.begin() + 2);
-  sub_graphs_.erase(sub_graphs_.begin() + 1);
-  sub_graphs_.insert(sub_graphs_.end(), std::move(new_npu_sub));
+  while (sub_graphs_.size() > 2) {
+    size_t sub1_index = 0;
+    int sub2_index = -1;
+    for (; sub1_index < sub_graphs_.size(); sub1_index++) {
+      for (size_t tmp2 = sub1_index + 1; tmp2 < sub_graphs_.size(); tmp2++) {
+        if (sub_graphs_[sub1_index].device_ == sub_graphs_[tmp2].device_) {
+          sub2_index = tmp2;
+          break;
+        }
+      }
+      if (sub2_index != -1) {
+        break;
+      }
+    }
+    MS_ASSERT(sub2_index > sub1_index);
+
+    Subgraph new_npu_sub;
+    Subgraph &npu_sub1 = sub_graphs_[sub1_index];
+    Subgraph &npu_sub2 = sub_graphs_[sub2_index];
+    new_npu_sub.nodes_.insert(new_npu_sub.nodes_.end(), npu_sub1.nodes_.begin(), npu_sub1.nodes_.end());
+    new_npu_sub.nodes_.insert(new_npu_sub.nodes_.end(), npu_sub2.nodes_.begin(), npu_sub2.nodes_.end());
+    new_npu_sub.heads_.insert(new_npu_sub.heads_.end(), npu_sub1.heads_.begin(), npu_sub1.heads_.end());
+    new_npu_sub.heads_.insert(new_npu_sub.heads_.end(), npu_sub2.heads_.begin(), npu_sub2.heads_.end());
+    new_npu_sub.ends_.insert(new_npu_sub.ends_.end(), npu_sub1.ends_.begin(), npu_sub1.ends_.end());
+    new_npu_sub.ends_.insert(new_npu_sub.ends_.end(), npu_sub2.ends_.begin(), npu_sub2.ends_.end());
+    sub_graphs_.erase(sub_graphs_.begin() + sub2_index);
+    sub_graphs_.erase(sub_graphs_.begin() + sub1_index);
+    sub_graphs_.insert(sub_graphs_.end(), std::move(new_npu_sub));
+  }
+
   return;
 }

 void SearchSubGraph::SubGraphSplitByOutput() {
+  if (!context_->IsGpuEnabled() || output_nodes_.size() > 4) {
+    return;
+  }
+
+  if (context_->IsCpuFloat16Enabled() || context_->IsGpuFloat16Enabled()) {
+    return;
+  }
+
   InitSearchTensor();

   InitSearchSubGraph();
|
||||||
|
|
||||||
InitMainGraphDevice();
|
InitMainGraphDevice();
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
} // namespace mindspore::lite
|
} // namespace mindspore::lite
|
||||||
|
|
|
@ -22,9 +22,9 @@
|
||||||
#include "include/model.h"
|
#include "include/model.h"
|
||||||
#include "src/lite_kernel.h"
|
#include "src/lite_kernel.h"
|
||||||
#include "src/lite_model.h"
|
#include "src/lite_model.h"
|
||||||
|
#include "src/inner_context.h"
|
||||||
|
|
||||||
namespace mindspore::lite {
|
namespace mindspore::lite {
|
||||||
#ifdef SUBGRAPH_SPLIT
|
|
||||||
class SearchSubGraph {
|
class SearchSubGraph {
|
||||||
enum TensorType { NORMAL, CONST, INPUT };
|
enum TensorType { NORMAL, CONST, INPUT };
|
||||||
|
|
||||||
|
@ -39,11 +39,11 @@ class SearchSubGraph {
|
||||||
std::vector<uint32_t> heads_;
|
std::vector<uint32_t> heads_;
|
||||||
std::vector<uint32_t> ends_;
|
std::vector<uint32_t> ends_;
|
||||||
bool search_terminate_ = false;
|
bool search_terminate_ = false;
|
||||||
mindspore::kernel::KERNEL_ARCH device_;
|
DeviceType device_;
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
SearchSubGraph(Model *model, std::vector<size_t> output_nodes) {
|
SearchSubGraph(const InnerContext *context, Model *model, std::vector<size_t> output_nodes) : context_(context) {
|
||||||
output_nodes_.insert(output_nodes_.end(), output_nodes.begin(), output_nodes.end());
|
output_nodes_.insert(output_nodes_.end(), output_nodes.begin(), output_nodes.end());
|
||||||
node_list_ = model->all_nodes_;
|
node_list_ = model->all_nodes_;
|
||||||
model_ = reinterpret_cast<LiteModel *>(model);
|
model_ = reinterpret_cast<LiteModel *>(model);
|
||||||
|
@ -65,14 +65,13 @@ class SearchSubGraph {
|
||||||
void InitMainGraphDevice();
|
void InitMainGraphDevice();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
const InnerContext *context_ = nullptr;
|
||||||
LiteModel *model_ = nullptr;
|
LiteModel *model_ = nullptr;
|
||||||
std::vector<Tensor> tensors_;
|
std::vector<Tensor> tensors_;
|
||||||
std::vector<Subgraph> sub_graphs_;
|
std::vector<Subgraph> sub_graphs_;
|
||||||
std::vector<size_t> output_nodes_;
|
std::vector<size_t> output_nodes_;
|
||||||
std::vector<Model::Node *> node_list_;
|
std::vector<Model::Node *> node_list_;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
|
||||||
} // namespace mindspore::lite
|
} // namespace mindspore::lite
|
||||||
|
|
||||||
#endif // MINDSPORE_LITE_SRC_SUB_GRAPH_SPLIT_H_
|
#endif // MINDSPORE_LITE_SRC_SUB_GRAPH_SPLIT_H_
|
||||||
|
|