!11419 [MS][LITE][Develop] concat supported in_tensor is weight
From: @pengyongrong
Reviewed-by: @ddwsky, @zhanghaibo5
Signed-off-by: @ddwsky
Commit: d14529d3c2
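In short: the OpenCL Concat kernel can now take constant (weight) tensors as inputs. Prepare() packs each const input to NHWC4 image data through the new ConvertWeightToTensor(), and RunAxis0()/Run() then bind either that pre-packed pointer or the tensor's own data. A minimal sketch of the selection rule, with illustrative names rather than the MindSpore API:

#include <cstddef>
#include <vector>

// Illustrative only: for input i, prefer the weight image pre-packed in Prepare();
// fall back to the tensor's own data when the input is not a constant.
void *SelectConcatInput(const std::vector<void *> &weight_ptrs, const std::vector<void *> &tensor_data, size_t i) {
  return weight_ptrs.at(i) != nullptr ? weight_ptrs.at(i) : tensor_data.at(i);
}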
@@ -38,7 +38,7 @@ int ConcatOpenCLKernel::RunAxis0() {
   auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
   cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
   for (int i = 0; i < in_tensors_.size(); i++) {
-    auto src_data = in_tensors_[i]->data_c();
+    auto src_data = inputs_weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : inputs_weight_ptrs_.at(i);
     allocator_->GetImageSize(src_data, &img_size);
     auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
     auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1};
@@ -160,10 +160,76 @@ void ConcatOpenCLKernel::SetGlobalLocal() {
   OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
 }
 
+int ConcatOpenCLKernel::ConvertWeightToTensor(const std::vector<lite::Tensor *> &in_tensors,
+                                              std::vector<void *> *inputs_weight_ptrs, bool fp16_enable,
+                                              size_t data_size) {
+  for (auto in_tensor_ : in_tensors) {
+    auto nhwc_shape = GetNHWCShape(in_tensor_->shape());
+    if (!in_tensor_->IsConst()) {
+      (*inputs_weight_ptrs).push_back(nullptr);
+    } else {
+      auto allocator = ocl_runtime_->GetAllocator();
+      std::vector<size_t> img_size = GetImage2dShapeFromNHWC(nhwc_shape, schema::Format_NHWC4);
+      int pack_weight_size = img_size[0] * img_size[1] * C4NUM;
+      int plane = nhwc_shape[1] * nhwc_shape[2];
+      int channel = nhwc_shape[3];
+      int batch = nhwc_shape[0];
+      img_size.push_back(fp16_enable ? CL_HALF_FLOAT : CL_FLOAT);
+      if (!fp16_enable) {
+        float *weight = new (std::nothrow) float[pack_weight_size];
+        if (weight == nullptr) {
+          MS_LOG(ERROR) << "Malloc buffer failed!";
+          return RET_ERROR;
+        }
+        memset(weight, 0x00, pack_weight_size * data_size);
+        if (in_tensor_->data_type() == kNumberTypeFloat32) {
+          std::function<float(float)> to_dtype = [](float x) -> float { return x; };
+          PackNHWCToNHWC4<float, float>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
+        } else if (in_tensor_->data_type() == kNumberTypeFloat16) {
+          std::function<float(float16_t)> to_dtype = [](float16_t x) -> float { return static_cast<float>(x); };
+          PackNHWCToNHWC4<float16_t, float>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
+        }
+        if (batch * plane * channel == 1) {
+          // scalar
+          weight[3] = weight[2] = weight[1] = weight[0];
+        }
+        auto weight_ptr_ = allocator->Malloc(pack_weight_size, img_size, weight);
+        (*inputs_weight_ptrs).push_back(weight_ptr_);
+        delete[] weight;
+      } else {
+        float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
+        if (weight == nullptr) {
+          MS_LOG(ERROR) << "Malloc buffer failed!";
+          return RET_ERROR;
+        }
+        memset(weight, 0x00, pack_weight_size * data_size);
+        if (in_tensor_->data_type() == kNumberTypeFloat32) {
+          std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+          PackNHWCToNHWC4<float, float16_t>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
+        } else if (in_tensor_->data_type() == kNumberTypeFloat16) {
+          std::function<float16_t(float16_t)> to_dtype = [](float16_t x) -> float16_t { return x; };
+          PackNHWCToNHWC4<float16_t, float16_t>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
+        }
+        if (batch * plane * channel == 1) {
+          // scalar
+          weight[3] = weight[2] = weight[1] = weight[0];
+        }
+        auto weight_ptr_ = allocator->Malloc(pack_weight_size, img_size, weight);
+        (*inputs_weight_ptrs).push_back(weight_ptr_);
+        delete[] weight;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 int ConcatOpenCLKernel::Prepare() {
+  enable_fp16_ = ocl_runtime_->GetFp16Enable();
+  auto data_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+  ConvertWeightToTensor(in_tensors_, &inputs_weight_ptrs_, enable_fp16_, data_size);
   if (axis_ == 0) {
     for (int i = 0; i < in_tensors_.size(); ++i) {
-      if (in_tensors_.at(0)->shape().size() != 1) {
+      if (in_tensors_.at(i)->shape().size() != 1) {
        return RET_OK;
       }
     }
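The packing above delegates to PackNHWCToNHWC4, whose implementation is not part of this diff. As a rough sketch (a hypothetical standalone helper, not the real MindSpore template), NHWC to NHWC4 packing pads the channel dimension up to a multiple of 4, zero-fills the padding, and converts each element through the supplied functor:

#include <functional>

// Hypothetical sketch of NHWC -> NHWC4 packing (not the MindSpore implementation).
template <typename SRC, typename DST>
void PackNHWCToNHWC4Sketch(const SRC *src, DST *dst, int batch, int plane, int channel,
                           const std::function<DST(SRC)> &to_dtype) {
  const int c4 = ((channel + 3) / 4) * 4;  // channel rounded up to a multiple of 4
  for (int b = 0; b < batch; ++b) {
    for (int p = 0; p < plane; ++p) {
      for (int c = 0; c < c4; ++c) {
        const int dst_idx = (b * plane + p) * c4 + c;
        dst[dst_idx] = (c < channel) ? to_dtype(src[(b * plane + p) * channel + c]) : static_cast<DST>(0);
      }
    }
  }
}

This is also why the diff replicates a scalar across weight[0..3]: a 1x1x1x1 tensor fills only one NHWC4 slot, so its value is copied into all four channel lanes.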
@@ -175,7 +241,7 @@ int ConcatOpenCLKernel::Prepare() {
       Align_ = false;
     }
   }
-  enable_fp16_ = ocl_runtime_->GetFp16Enable();
+
   std::string kernel_name = "Concat";
   if (axis_ == 3 && !Align_) {
     kernel_name += "Input" + std::to_string(in_tensors_.size()) + "UnAlign";
@@ -202,7 +268,8 @@ int ConcatOpenCLKernel::Run() {
   }
   int arg_cn = 0;
   for (int i = 0; i < in_tensors_.size(); ++i) {
-    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c());
+    auto input_ptr = inputs_weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : inputs_weight_ptrs_.at(i);
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_ptr);
   }
   if (axis_ == 3 && !Align_) {
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
@@ -43,6 +43,7 @@ class ConcatOpenCLKernel : public OpenCLKernel {
   uint32_t OC = {1};
   std::vector<size_t> global;
   bool Align_{true};
+  std::vector<void *> inputs_weight_ptrs_;
   bool enable_fp16_{false};
   cl_int stride_w{1};
   cl_int4 in_shape_{};
@@ -51,6 +52,8 @@ class ConcatOpenCLKernel : public OpenCLKernel {
 
  private:
   int RunAxis0();
+  int ConvertWeightToTensor(const std::vector<lite::Tensor *> &in_tensors, std::vector<void *> *inputs_weight_ptrs,
+                            bool fp16_enable, size_t data_size);
 };
 
 } // namespace mindspore::kernel
@@ -60,7 +60,7 @@ void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
   img_size.push_back(UP_DIV(NumA, C4NUM));
   img_size.push_back(NumA);
   size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
-  size_t dtype_size = enable_fp16_ ? sizeof(CL_HALF_FLOAT) : sizeof(CL_FLOAT);
+  size_t dtype_size = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
   img_size.push_back(img_dtype);
   auto allocator = ocl_runtime_->GetAllocator();
   size_t memA = NumA * NumA;
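The one-line Strassen change above fixes a subtle bug: CL_HALF_FLOAT and CL_FLOAT are integer channel-type constants, not types, so sizeof() on them yields sizeof(int) rather than the element size. A small standalone check against the standard OpenCL headers shows the difference:

#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl.h>
#include <cstdio>

int main() {
  std::printf("sizeof(CL_HALF_FLOAT) = %zu\n", sizeof(CL_HALF_FLOAT));  // sizeof(int), typically 4
  std::printf("sizeof(cl_half)       = %zu\n", sizeof(cl_half));        // 2
  std::printf("sizeof(cl_float)      = %zu\n", sizeof(cl_float));       // 4
  return 0;
}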
@@ -178,29 +178,6 @@ void StrassenOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset);
 }
 
-// OriginSize = N*H*W*C typesize = sizeof(type data) width = W * UP_DIV(C,C4NUM) size = N
-void StrassenOpenCLKernel::PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size) {
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->SyncCommandQueue();
-  MS_ASSERT(alignment);
-  size_t row_pitch = UP_ROUND(width, alignment) * typesize * C4NUM;
-  size_t OriginSize = size * size * typesize;
-  std::vector<char> data(OriginSize);
-  auto row_size = width * typesize * C4NUM;
-
-  for (int i = 0; i < size; ++i) {
-    memcpy(reinterpret_cast<char *>(data.data()) + i * row_size, static_cast<char *>(IMGData) + i * row_pitch,
-           row_size);
-  }
-  for (int i = 0; i < size * size; ++i) {
-    if ((i + 1) % size == 0) {
-      std::cout << std::endl;
-    }
-  }
-}
-
 void StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size,
                                               cl_int2 offset, lite::opencl::MemType mem_type) {
   if (input == nullptr || output == nullptr) {
@@ -344,7 +321,7 @@ void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, co
 
 int StrassenOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  int threshold = 0;
+  int threshold;
   const int up_bound = 1024;
   const int down_bound = 256;
   if (in_tensors_.at(0)->shape()[0] >= up_bound) {
@@ -48,7 +48,6 @@ class StrassenOpenCLKernel : public MatMulOpenCLKernel {
   void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5,
                           void *input6, void *input7, void *output, const int size);
   void StrassenRunMmatmul(void *input, void *weight, void *output, const int size);
-  void PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size);
   cl::Kernel kernel_IMG_add_sub_2;
   cl::Kernel MatMul_StrassenBUFFilled;
   cl::Kernel MatMul_StrassenIMGFilled;