optimize_transpose_fp16

sunsuodong 2021-05-26 14:17:21 +08:00
parent 3cc2b6513c
commit 69a11ce6e7
18 changed files with 188 additions and 311 deletions

View File

@@ -196,34 +196,34 @@
} \
}
#define TRANSPOSE_DIMS(TYPE, NAME) \
void TransposeDims##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, int *size, int *position, \
TransposeParameter *transpose_param, int task_id, int thread_num) { \
int *perm = transpose_param->perm_; \
int *strides = transpose_param->strides_; \
int *out_strides = transpose_param->out_strides_; \
int num_axes = transpose_param->num_axes_; \
size_t data_size = (*size) * output_shape[0]; \
size_t offset_size = UP_DIV(data_size, thread_num); \
size_t task_offset = offset_size * task_id; \
int count = data_size - task_offset; \
if (count <= 0) { \
return; \
} \
count = MSMIN(offset_size, count); \
for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
int pos = idx; \
int output_idx = 0; \
int input_idx = 0; \
for (int i = 0; i < num_axes; ++i) { \
*(position + i) = pos / *(size + i); \
int out_stride = i < num_axes - 1 ? out_strides[i] : 1; \
output_idx += (*(position + i) * out_stride); \
input_idx += (*(position + i) * strides[perm[i]]); \
pos -= *(position + i) * (*(size + i)); \
} \
out_data[output_idx] = in_data[input_idx]; \
} \
#define TRANSPOSE_DIMS(TYPE, NAME) \
void TransposeDims##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, \
TransposeParameter *transpose_param, int task_id, int thread_num) { \
int *perm = transpose_param->perm_; \
int *strides = transpose_param->strides_; \
int *out_strides = transpose_param->out_strides_; \
int num_axes = transpose_param->num_axes_; \
size_t data_size = (*out_strides) * output_shape[0]; \
size_t offset_size = UP_DIV(data_size, thread_num); \
size_t task_offset = offset_size * task_id; \
int count = data_size - task_offset; \
if (count <= 0) { \
return; \
} \
count = MSMIN(offset_size, count); \
for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
int pos = idx; \
int output_idx = 0; \
int input_idx = 0; \
for (int i = 0; i < num_axes; ++i) { \
int position = pos / *(out_strides + i); \
int out_stride = i < num_axes - 1 ? out_strides[i] : 1; \
output_idx += (position * out_stride); \
input_idx += (position * strides[perm[i]]); \
pos -= position * (*(out_strides + i)); \
} \
out_data[output_idx] = in_data[input_idx]; \
} \
}
#define DOTRANSPOSE(TYPE, NAME) \

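The new TRANSPOSE_DIMS body drops the separate size and position scratch arrays: because out_strides[i] is already the product of the trailing output dimensions, each thread can decompose its flat output index on the fly. A minimal standalone sketch of that decomposition (toy 3-D shape and permutation chosen purely for illustration; not the library code):

#include <cstdio>

int main() {
  const int perm[3] = {2, 0, 1};      // output axis i reads input axis perm[i]
  const int in_shape[3] = {2, 3, 4};
  const int out_shape[3] = {4, 2, 3}; // out_shape[i] == in_shape[perm[i]]
  int strides[3], out_strides[3];
  strides[2] = out_strides[2] = 1;
  for (int i = 1; i >= 0; --i) {
    strides[i] = in_shape[i + 1] * strides[i + 1];
    out_strides[i] = out_shape[i + 1] * out_strides[i + 1];
  }
  float in[24], out[24];
  for (int i = 0; i < 24; ++i) in[i] = static_cast<float>(i);
  const int total = out_strides[0] * out_shape[0];  // product of all dims: 24
  for (int idx = 0; idx < total; ++idx) {
    int pos = idx, input_idx = 0;
    for (int i = 0; i < 3; ++i) {
      int position = pos / out_strides[i];  // coordinate along output axis i
      input_idx += position * strides[perm[i]];
      pos -= position * out_strides[i];
    }
    out[idx] = in[input_idx];  // output_idx recomposes to idx itself
  }
  printf("out[0..2] = %g %g %g\n", out[0], out[1], out[2]);  // 0 4 8
  return 0;
}

Since the coordinates are derived from idx through the same out_strides used to recompose output_idx, output_idx always equals idx, which is why no per-thread position buffer is needed.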
View File

@@ -40,21 +40,21 @@ int DoTransposeInt64(const int64_t *in_data, int64_t *out_data, const int *outpu
TransposeParameter *transpose_param);
int DoTransposeBool(const bool *in_data, bool *out_data, const int *output_shape, TransposeParameter *transpose_param);
void TransposeDimsUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsBool(const bool *in_data, bool *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsBool(const bool *in_data, bool *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
#ifdef __cplusplus

View File

@@ -159,7 +159,8 @@ void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
}
}
void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int channel) {
void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int channel, int task_id,
int thread_count) {
#ifdef ENABLE_ARM64
// Transpose16x8 in arm64
const int hw_tile = C16NUM;
@@ -167,13 +168,27 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
// Transpose8x8 in others
const int hw_tile = C8NUM;
#endif
int hw_align = plane / hw_tile * hw_tile;
int hw_align = plane / hw_tile;
int task_start = 0;
int task_end = plane;
if (thread_count > 0) {
int offset_hw = UP_DIV(hw_align, thread_count) * hw_tile;
task_start = offset_hw * task_id;
int count = plane - task_start;
if (count <= 0) {
return;
}
task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw);
hw_align = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0);
} else {
hw_align *= hw_tile;
}
int c8 = channel / C8NUM * C8NUM;
int batch = plane * channel;
for (int n = 0; n < batches; n++) {
const float16_t *src_batch = (const float16_t *)src + n * batch;
float16_t *dst_batch = (float16_t *)dst + n * batch;
int hw = 0;
int hw = task_start;
for (; hw < hw_align; hw += hw_tile) {
int c = 0;
for (; c < c8; c += C8NUM) {
@@ -203,7 +218,7 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
}
}
}
for (; hw < plane; hw++) {
for (; hw < task_end; hw++) {
const float16_t *src_ptr = src_batch + hw * channel;
float16_t *dst_ptr = dst_batch + hw;
for (size_t i = 0; i < channel; i++) {
@@ -213,8 +228,8 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
}
}
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
return PackNHWCToNCHWFp16(src, dst, batch, channel, plane);
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count) {
return PackNHWCToNCHWFp16(src, dst, batch, channel, plane, task_id, thread_count);
}
void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {

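PackNHWCToNCHWFp16 now takes task_id/thread_count and splits plane into hw_tile-aligned chunks, so each thread runs the vectorized 16x8 (or 8x8) transpose on whole tiles and only the thread owning the tail falls through to the scalar loop. A standalone sketch of the partition arithmetic (hypothetical sizes; UP_DIV/MSMIN assumed to be the usual round-up-divide and min macros):

#include <cstdio>
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main() {
  const int plane = 37, hw_tile = 16, thread_count = 3;  // toy sizes
  const int tiles = plane / hw_tile;                     // full tiles: 2
  for (int task_id = 0; task_id < thread_count; ++task_id) {
    int offset_hw = UP_DIV(tiles, thread_count) * hw_tile;  // rows per thread
    int task_start = offset_hw * task_id;
    if (plane - task_start <= 0) continue;  // thread beyond the data: no work
    int task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw);
    // hw_align: where the tiled fast path stops and the per-row tail begins
    int hw_align = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0);
    printf("task %d: rows [%d, %d), tiled up to %d\n", task_id, task_start, task_end, hw_align);
  }
  return 0;
}

With these numbers the three tasks cover [0,16), [16,32) and [32,37); the last range is smaller than a full tile, so it is handled entirely by the scalar for (; hw < task_end; hw++) loop. Passing thread_count 0 keeps the old single-threaded behavior, which is why the depthwise-conv weight packing later in this commit passes 0, 0.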
View File

@@ -37,9 +37,9 @@ void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel);
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
void PackNHWCToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel);

View File

@@ -173,38 +173,44 @@ void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, int *strid
}
}
void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape, int dims, int *size, int *position) {
*(size + dims - 1) = 1;
for (int i = dims - 1; i > 0; --i) {
*(size + i - 1) = *(size + i) * output_shape[i];
void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *param, int task_id, int thread_num) {
int *perm = param->perm_;
int *strides = param->strides_;
int *out_strides = param->out_strides_;
int num_axes = param->num_axes_;
size_t data_size = (*out_strides) * output_shape[0];
size_t offset_size = UP_DIV(data_size, thread_num);
size_t task_offset = offset_size * task_id;
int count = data_size - task_offset;
if (count <= 0) {
return;
}
for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) {
count = MSMIN(offset_size, count);
for (size_t idx = task_offset; idx < task_offset + count; ++idx) {
int pos = idx;
int output_idx = 0;
int input_idx = 0;
for (int i = 0; i < dims; ++i) {
*(position + i) = pos / *(size + i);
int out_stride = i < dims - 1 ? out_strides[i] : 1;
output_idx += (*(position + i) * out_stride);
input_idx += (*(position + i) * strides[perm[i]]);
pos -= *(position + i) * (*(size + i));
for (int i = 0; i < num_axes; ++i) {
int position = pos / *(out_strides + i);
int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
output_idx += (position * out_stride);
input_idx += (position * strides[perm[i]]);
pos -= position * (*(out_strides + i));
}
out_data[output_idx] = in_data[input_idx];
}
}
int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int *size, int *position) {
int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, TransposeParameter *param) {
if (in_data == NULL || out_data == NULL) {
return NNACL_ERR;
}
int *perm = transpose_param->perm_;
int *strides = transpose_param->strides_;
int *out_strides = transpose_param->out_strides_;
int data_size = transpose_param->data_size_;
int num_axes = transpose_param->num_axes_;
int *perm = param->perm_;
int *strides = param->strides_;
int *out_strides = param->out_strides_;
int data_size = param->data_size_;
int num_axes = param->num_axes_;
// check if transpose is needed
bool needTranspose = false;
@@ -235,7 +241,7 @@ int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *ou
} else if (num_axes == 6) {
Fp16TransposeDim6(in_data, out_data, strides, out_strides, perm, output_shape);
} else {
TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
return NNACL_ERR;
}
return NNACL_OK;
}

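In the reworked TransposeDimsFp16, data_size = out_strides[0] * output_shape[0] is the total element count, since out_strides[0] is the product of all trailing output dims, and UP_DIV rounds the per-thread share up so the slices cover every element. A worked sketch of the slicing with hypothetical numbers (UP_DIV/MSMIN assumed to be the usual macros):

#include <cstdio>
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main() {
  // output_shape = {2, 3, 4, 5}  =>  out_strides = {60, 20, 5, 1}
  const int data_size = 60 * 2;  // 120 = 2*3*4*5 elements
  const int thread_num = 7;      // deliberately not a divisor of 120
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int offset_size = UP_DIV(data_size, thread_num);  // 18 per slice
    int task_offset = offset_size * task_id;
    int count = data_size - task_offset;
    if (count <= 0) continue;           // over-provisioned threads do nothing
    count = MSMIN(offset_size, count);  // the last busy slice is the short tail
    printf("task %d: [%d, %d)\n", task_id, task_offset, task_offset + count);
  }
  return 0;
}

Note also that DoTransposeFp16 now returns NNACL_ERR for more than six axes; the >6D case is expected to go through the threaded TransposeDimsFp16 instead, as the kernel changes below show.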
View File

@@ -24,8 +24,9 @@
#ifdef __cplusplus
extern "C" {
#endif
int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int *size, int *position);
void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *param, int task_id, int thread_num);
int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, TransposeParameter *param);
#ifdef __cplusplus
}
#endif

View File

@@ -171,13 +171,13 @@ void TransposeDim6Fp32(const float *in_data, float *out_data, const int *strides
}
}
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, const int *size, int *position,
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num) {
int *perm = transpose_param->perm_;
int *strides = transpose_param->strides_;
int *out_strides = transpose_param->out_strides_;
int num_axes = transpose_param->num_axes_;
size_t data_size = (*size) * output_shape[0];
size_t data_size = (*out_strides) * output_shape[0];
size_t offset_size = UP_DIV(data_size, thread_num);
size_t task_offset = offset_size * task_id;
int count = data_size - task_offset;
@@ -190,11 +190,11 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_
int output_idx = 0;
int input_idx = 0;
for (int i = 0; i < num_axes; ++i) {
*(position + i) = pos / *(size + i);
int position = pos / *(out_strides + i);
int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
output_idx += (*(position + i) * out_stride);
input_idx += (*(position + i) * strides[perm[i]]);
pos -= *(position + i) * (*(size + i));
output_idx += (position * out_stride);
input_idx += (position * strides[perm[i]]);
pos -= position * (*(out_strides + i));
}
out_data[output_idx] = in_data[input_idx];
}

View File

@@ -26,7 +26,7 @@ extern "C" {
#endif
int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *param);
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, const int *size, int *position,
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
#ifdef __cplusplus
}

View File

@@ -220,13 +220,13 @@ int DoTransposeInt8(const int8_t *in_data, int8_t *out_data, const int *output_s
return NNACL_OK;
}
void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape, const int *size, int *position,
void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num) {
int *perm = transpose_param->perm_;
int *strides = transpose_param->strides_;
int *out_strides = transpose_param->out_strides_;
int num_axes = transpose_param->num_axes_;
size_t data_size = (*size) * output_shape[0];
size_t data_size = (*out_strides) * output_shape[0];
size_t offset_size = UP_DIV(data_size, thread_num);
size_t task_offset = offset_size * task_id;
int count = data_size - task_offset;
@@ -239,11 +239,11 @@ void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *outpu
int output_idx = 0;
int input_idx = 0;
for (int i = 0; i < num_axes; ++i) {
*(position + i) = pos / *(size + i);
int position = pos / *(out_strides + i);
int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
output_idx += (*(position + i) * out_stride);
input_idx += (*(position + i) * strides[perm[i]]);
pos -= *(position + i) * (*(size + i));
output_idx += (position * out_stride);
input_idx += (position * strides[perm[i]]);
pos -= position * (*(out_strides + i));
}
out_data[output_idx] = in_data[input_idx];
}

View File

@@ -27,7 +27,7 @@ extern "C" {
int DoTransposeInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape,
TransposeParameter *transpose_param);
void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape, const int *size, int *position,
void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
#ifdef __cplusplus
}

View File

@@ -127,18 +127,9 @@ void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, con
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
const float block_size = 128.0;
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
int dims = SizeToInt(axes_.size());
int *size = new int[dims];
size[dims - 1] = 1;
for (int i = dims - 1; i > 0; i--) {
size[i - 1] = size[i] * output_shape_[i];
}
int **position = new int *[thread_num];
for (size_t i = 0; i < thread_num; ++i) {
position[i] = new int[dims];
}
std::vector<common::Task> tasks;
std::function<void(const T *, T *, const int *, int *, int *, TransposeParameter *, int, int)> TransposeDims;
std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
if constexpr (std::is_same_v<T, int8_t>) {
TransposeDims = &TransposeDimsInt8;
} else if constexpr (std::is_same_v<T, int16_t>) {
@@ -162,15 +153,12 @@
}
for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
auto task = [&, task_id, thread_num]() {
TransposeDims(input_addr, output_addr, output_shape, size, position[task_id], &transpose_param_, task_id,
SizeToInt(thread_num));
TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
return common::SUCCESS;
};
tasks.emplace_back(task);
}
common::ThreadPool::GetInstance().SyncRun(tasks);
delete[] size;
delete[] position;
}
} // namespace kernel
} // namespace mindspore

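With the size/position parameters gone from the C entry points, ParallelRun no longer allocates per-thread scratch (the deleted new[]/delete[] pairs above). The remaining type dispatch binds the right C function at compile time via if constexpr; a reduced sketch of that pattern (two toy types only, hypothetical stubs standing in for the nnacl functions):

#include <cstdint>
#include <cstdio>
#include <functional>
#include <type_traits>

struct TransposeParameter;  // opaque in this sketch

// Stubs with the new six-argument shape: data in/out, output shape,
// parameter block, task_id, thread_num.
void TransposeDimsInt8(const int8_t *, int8_t *, const int *, TransposeParameter *, int, int) {}
void TransposeDimsInt32(const int32_t *, int32_t *, const int *, TransposeParameter *, int, int) {}

template <typename T>
void ParallelRunSketch(const T *in, T *out, const int *shape, TransposeParameter *param, int thread_num) {
  std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
  if constexpr (std::is_same_v<T, int8_t>) {
    TransposeDims = &TransposeDimsInt8;
  } else if constexpr (std::is_same_v<T, int32_t>) {
    TransposeDims = &TransposeDimsInt32;
  }
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    TransposeDims(in, out, shape, param, task_id, thread_num);  // one task per thread
  }
}

int main() {
  int32_t in[4] = {1, 2, 3, 4}, out[4] = {0};
  int shape[2] = {2, 2};
  ParallelRunSketch<int32_t>(in, out, shape, nullptr, 2);
  printf("dispatched\n");
  return 0;
}

Here the plain loop stands in for the ThreadPool tasks; the real code wraps each call in a common::Task and hands the batch to SyncRun.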
View File

@@ -44,7 +44,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
PackNCHWToNHWCFp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch());
weight_tensor->Batch(), 0, 0);
bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t)));
if (bias_data_ == nullptr) {

View File

@@ -13,9 +13,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/transpose_fp16.h"
#include <vector>
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/transpose_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
@@ -30,63 +30,19 @@ using mindspore::lite::RET_OP_EXECUTE_FAILURE;
using mindspore::schema::PrimitiveType_Transpose;
namespace mindspore::kernel {
int TransposeFp16CPUKernel::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
return TransposeCPUKernel::ReSize();
void TransposeFp16CPUKernel::GetNchwToNhwcFunc() { NHNCTransposeFunc_ = PackNCHWToNHWCFp16; }
void TransposeFp16CPUKernel::GetNhwcToNchwFunc() { NHNCTransposeFunc_ = PackNHWCToNCHWFp16; }
int TransposeFp16CPUKernel::TransposeDim2to6() {
return DoTransposeFp16(static_cast<const float16_t *>(in_data_), static_cast<float16_t *>(out_data_), out_shape_,
param_);
}
int TransposeFp16CPUKernel::Run() {
MS_ASSERT(in_tensors_.size() == 1 || in_tensors_.size() == 2);
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
param->data_size_ = in_tensors_[0]->Size();
MS_ASSERT(out_tensors_.size() == 1);
auto &in_tensor = in_tensors_.front();
auto &out_tensor = out_tensors_.front();
if (in_tensor == nullptr || out_tensor == nullptr) {
MS_LOG(ERROR) << "null pointer referencing.";
return RET_ERROR;
}
in_data_fp16_ = reinterpret_cast<float16_t *>(in_tensor->MutableData());
out_data_fp16_ = reinterpret_cast<float16_t *>(out_tensor->MutableData());
MS_ASSERT(in_data_fp16_);
MS_ASSERT(out_data_fp16_);
if (in_tensor->shape().size() != static_cast<size_t>(param->num_axes_)) {
memcpy(out_data_fp16_, in_data_fp16_, in_tensor->ElementsNum() * sizeof(float16_t));
return RET_OK;
}
int dims = out_tensor->shape().size();
if (dims > DIMENSION_6D) {
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (dim_size_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR;
}
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (position_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
context_->allocator->Free(dim_size_);
dim_size_ = nullptr;
return RET_ERROR;
}
}
MS_ASSERT(out_shape_);
auto ret = Fp16DoTranspose(in_data_fp16_, out_data_fp16_, out_shape_, param, dim_size_, position_);
if (dims > DIMENSION_6D) {
context_->allocator->Free(dim_size_);
context_->allocator->Free(position_);
dim_size_ = nullptr;
position_ = nullptr;
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "Transpose run failed";
return RET_ERROR;
}
return ret;
int TransposeFp16CPUKernel::TransposeDimGreaterThan6(int task_id) {
TransposeDimsFp16(static_cast<const float16_t *>(in_data_), static_cast<float16_t *>(out_data_), out_shape_, param_,
task_id, op_parameter_->thread_num_);
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Transpose, LiteKernelCreator<TransposeFp16CPUKernel>)

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP16_TRANSPOSE_FP16_H_
#define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP16_TRANSPOSE_FP16_H_
@@ -32,12 +31,11 @@ class TransposeFp16CPUKernel : public TransposeCPUKernel {
: TransposeCPUKernel(param, inputs, outputs, ctx) {}
~TransposeFp16CPUKernel() = default;
int Init() override;
int Run() override;
private:
float16_t *in_data_fp16_ = nullptr;
float16_t *out_data_fp16_ = nullptr;
void GetNchwToNhwcFunc() override;
void GetNhwcToNchwFunc() override;
int TransposeDim2to6() override;
int TransposeDimGreaterThan6(int task_id) override;
};
} // namespace mindspore::kernel

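The fp16 kernel shrinks to four overrides because the commit turns the fp32 TransposeCPUKernel into a template method: the base class owns Run()'s control flow and the subclass swaps only the data-type-specific pieces. A simplified sketch of the shape of that refactor (toy classes, not the real kernel hierarchy):

#include <cstdio>

struct TransposeBase {
  virtual ~TransposeBase() = default;
  virtual int TransposeDim2to6() { printf("fp32 2D-6D path\n"); return 0; }
  virtual int TransposeDimGreaterThan6(int task_id) { printf("fp32 >6D slice %d\n", task_id); return 0; }
  int Run(int dims, int thread_num) {
    if (dims <= 6) return TransposeDim2to6();
    // stands in for ParallelLaunch: each task gets its task_id
    for (int t = 0; t < thread_num; ++t) TransposeDimGreaterThan6(t);
    return 0;
  }
};

struct TransposeFp16 : TransposeBase {
  int TransposeDim2to6() override { printf("fp16 2D-6D path\n"); return 0; }
  int TransposeDimGreaterThan6(int task_id) override { printf("fp16 >6D slice %d\n", task_id); return 0; }
};

int main() {
  TransposeFp16 kernel;
  kernel.Run(4, 2);  // base control flow, fp16 2D-6D implementation
  kernel.Run(7, 2);  // base control flow, threaded fp16 N-D path
  return 0;
}

This is what lets the old fp16 Run(), with all its manual buffer management, be deleted outright.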
View File

@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp32/transpose_fp32.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
@@ -36,16 +35,15 @@ int TransposeCPUKernel::Init() {
}
int TransposeCPUKernel::ReSize() {
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(op_parameter_);
if (in_tensors_.size() == 2) {
param->num_axes_ = in_tensors_.at(1)->ElementsNum();
param_->num_axes_ = in_tensors_.at(1)->ElementsNum();
}
int trans3d[3] = {0, 2, 1};
int *perm_data = nullptr;
auto input_tensor = in_tensors_.at(kInputIndex);
if (input_tensor->shape().size() != static_cast<size_t>(param->num_axes_)) {
if (input_tensor->shape().size() == 3 && param->num_axes_ == 4) {
param->num_axes_ = 3;
if (input_tensor->shape().size() != static_cast<size_t>(param_->num_axes_)) {
if (input_tensor->shape().size() == 3 && param_->num_axes_ == 4) {
param_->num_axes_ = 3;
perm_data = trans3d;
} else {
return RET_OK;
@@ -55,21 +53,19 @@ int TransposeCPUKernel::ReSize() {
auto perm_tensor = in_tensors_.at(1);
perm_data = reinterpret_cast<int *>(perm_tensor->data_c());
}
// set perm data
MS_ASSERT(perm_data != nullptr);
for (int i = 0; i < param->num_axes_; ++i) {
param->perm_[i] = perm_data[i];
for (int i = 0; i < param_->num_axes_; ++i) {
param_->perm_[i] = perm_data[i];
}
auto &inTensor = in_tensors_.front();
auto &outTensor = out_tensors_.front();
auto in_shape = inTensor->shape();
auto out_shape = outTensor->shape();
param->strides_[param->num_axes_ - 1] = 1;
param->out_strides_[param->num_axes_ - 1] = 1;
param->data_size_ = inTensor->Size();
for (int i = param->num_axes_ - 2; i >= 0; i--) {
param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1];
param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1];
param_->strides_[param_->num_axes_ - 1] = 1;
param_->out_strides_[param_->num_axes_ - 1] = 1;
param_->data_size_ = inTensor->Size();
for (int i = param_->num_axes_ - 2; i >= 0; i--) {
param_->strides_[i] = in_shape.at(i + 1) * param_->strides_[i + 1];
param_->out_strides_[i] = out_shape.at(i + 1) * param_->out_strides_[i + 1];
}
if (this->out_shape_ != nullptr) {
@@ -92,35 +88,49 @@ TransposeCPUKernel::~TransposeCPUKernel() {
}
}
void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor,
TransposeParameter *param) {
void TransposeCPUKernel::GetNchwToNhwcFunc() { NHNCTransposeFunc_ = PackNCHWToNHWCFp32; }
void TransposeCPUKernel::GetNhwcToNchwFunc() { NHNCTransposeFunc_ = PackNHWCToNCHWFp32; }
int TransposeCPUKernel::TransposeDim2to6() {
return DoTransposeFp32(static_cast<const float *>(in_data_), static_cast<float *>(out_data_), out_shape_, param_);
}
int TransposeCPUKernel::TransposeDimGreaterThan6(int task_id) {
TransposeDimsFp32(static_cast<const float *>(in_data_), static_cast<float *>(out_data_), out_shape_, param_, task_id,
op_parameter_->thread_num_);
return RET_OK;
}
void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor) {
if (in_tensor->shape().size() != 4) {
return;
}
auto out_shape = out_tensor->shape();
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 &&
param->perm_[3] == 1) {
if (param_->perm_[0] == 0 && param_->perm_[1] == 2 && param_->perm_[2] == 3 && param_->perm_[3] == 1) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[1] * out_shape[2];
nhnc_param_[2] = out_shape[3];
if (in_tensor->data_type() == kNumberTypeFloat32) {
NHNCTransposeFunc_ = PackNCHWToNHWCFp32;
GetNchwToNhwcFunc();
}
}
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
param->perm_[3] == 2) {
if (param_->perm_[0] == 0 && param_->perm_[1] == 3 && param_->perm_[2] == 1 && param_->perm_[3] == 2) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[2] * out_shape[3];
nhnc_param_[2] = out_shape[1];
if (in_tensor->data_type() == kNumberTypeFloat32) {
NHNCTransposeFunc_ = PackNHWCToNCHWFp32;
GetNhwcToNchwFunc();
}
}
}
int TransposeCPUKernel::RunImpl(int task_id) {
if (NHNCTransposeFunc_ != nullptr) {
NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id, thread_count_);
NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id,
op_parameter_->thread_num_);
} else {
TransposeDimsFp32(in_data_, out_data_, out_shape_, dim_size_, position_ + dims_ * task_id, param_, task_id,
thread_count_);
return TransposeDimGreaterThan6(task_id);
}
return RET_OK;
}
@@ -143,63 +153,26 @@ int TransposeCPUKernel::Run() {
MS_LOG(ERROR) << "null pointer dreferencing.";
return RET_ERROR;
}
in_data_ = reinterpret_cast<float *>(in_tensor->MutableData());
out_data_ = reinterpret_cast<float *>(out_tensor->MutableData());
in_data_ = in_tensor->data_c();
out_data_ = out_tensor->data_c();
MS_ASSERT(in_data_);
MS_ASSERT(out_data_);
param_ = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
if (in_tensor->shape().size() != static_cast<size_t>(param_->num_axes_)) {
memcpy(out_data_, in_data_, in_tensor->ElementsNum() * sizeof(float));
memcpy(out_data_, in_data_, in_tensor->Size());
return RET_OK;
}
thread_count_ = op_parameter_->thread_num_;
GetNHNCTransposeFunc(in_tensor, out_tensor, param_);
GetNHNCTransposeFunc(in_tensor, out_tensor);
if (NHNCTransposeFunc_ != nullptr) {
auto ret = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeImpl, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "NHNCTransposeFunc_ is error!";
}
return ret;
return static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeImpl, this, op_parameter_->thread_num_);
}
MS_ASSERT(out_shape_);
dims_ = out_tensor->shape().size();
if (dims_ > DIMENSION_6D) {
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int)));
if (dim_size_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_NULL_PTR;
}
*(dim_size_ + dims_ - 1) = 1;
for (int i = dims_ - 1; i > 0; --i) {
*(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i];
}
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int) * thread_count_));
if (position_ == nullptr) {
context_->allocator->Free(dim_size_);
MS_LOG(ERROR) << "Malloc data failed";
return RET_NULL_PTR;
}
}
int ret;
if (dims_ > DIMENSION_6D) {
ret = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeImpl, this, thread_count_);
if (out_tensor->shape().size() <= DIMENSION_6D) {
return TransposeDim2to6();
} else {
ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param_);
return static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeImpl, this, op_parameter_->thread_num_);
}
if (dims_ > DIMENSION_6D) {
context_->allocator->Free(dim_size_);
context_->allocator->Free(position_);
dim_size_ = nullptr;
position_ = nullptr;
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "Transpose run failed";
}
return ret;
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)

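Both branches of the new Run() funnel through ParallelLaunch with TransposeImpl as the trampoline, so every path (NHWC/NCHW fast pack and the >6D generic transpose) is sliced by task_id and no shared scratch is needed. A minimal sketch of that callback shape (std::thread stands in for the lite thread pool, which is an assumption of this sketch):

#include <cstdio>
#include <thread>
#include <vector>

struct Kernel {
  int RunImpl(int task_id) {
    printf("slice %d\n", task_id);  // each worker transposes its own slice
    return 0;
  }
};

// C-style trampoline with the (void *cdata, int task_id) shape that the
// ParallelLaunch callbacks in this commit use (compare TransposeInt8Run below).
int TransposeImpl(void *cdata, int task_id) {
  return static_cast<Kernel *>(cdata)->RunImpl(task_id);
}

int main() {
  Kernel kernel;
  const int thread_num = 3;
  std::vector<std::thread> workers;
  for (int t = 0; t < thread_num; ++t) {
    workers.emplace_back([&kernel, t] { TransposeImpl(&kernel, t); });
  }
  for (auto &w : workers) w.join();
  return 0;
}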
View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_H_
#define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_H_
@@ -33,7 +32,9 @@ class TransposeCPUKernel : public InnerKernel {
public:
explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(param, inputs, outputs, ctx) {}
: InnerKernel(param, inputs, outputs, ctx) {
param_ = reinterpret_cast<TransposeParameter *>(param);
}
~TransposeCPUKernel() override;
int Init() override;
@@ -42,17 +43,18 @@ class TransposeCPUKernel : public InnerKernel {
int RunImpl(int task_id);
protected:
void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
float *in_data_ = nullptr;
float *out_data_ = nullptr;
virtual void GetNchwToNhwcFunc();
virtual void GetNhwcToNchwFunc();
virtual int TransposeDim2to6();
virtual int TransposeDimGreaterThan6(int task_id);
void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor);
void *in_data_ = nullptr;
void *out_data_ = nullptr;
int *out_shape_ = nullptr;
int *dim_size_ = nullptr;
int *position_ = nullptr;
TransposeParameter *param_ = nullptr;
TransposeFunc NHNCTransposeFunc_ = nullptr;
int thread_count_ = 0;
int nhnc_param_[3] = {0};
int dims_ = 0;
};
} // namespace mindspore::kernel

View File

@@ -41,47 +41,6 @@ int TransposeInt8Run(void *cdata, int task_id) {
return RET_OK;
}
void TransposeInt8CPUKernel::FreeTmpBuf() {
if (!extra_dims_) {
return;
}
if (dim_size_ != nullptr) {
context_->allocator->Free(dim_size_);
dim_size_ = nullptr;
}
if (position_ != nullptr) {
context_->allocator->Free(position_);
position_ = nullptr;
}
return;
}
int TransposeInt8CPUKernel::MallocTmpBuf() {
if (!extra_dims_) {
return RET_OK;
}
int dims = out_tensors_.at(0)->shape().size();
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (dim_size_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR;
}
*(dim_size_ + dims - 1) = 1;
for (int i = dims - 1; i > 0; --i) {
*(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i];
}
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int) * op_parameter_->thread_num_));
if (position_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
context_->allocator->Free(dim_size_);
dim_size_ = nullptr;
return RET_ERROR;
}
return RET_OK;
}
int TransposeInt8CPUKernel::ReSize() {
auto in_tensor = in_tensors_.front();
auto out_tensor = out_tensors_.front();
@@ -105,20 +64,16 @@ int TransposeInt8CPUKernel::ReSize() {
transpose_param_->strides_[i] = in_shape.at(i + 1) * transpose_param_->strides_[i + 1];
transpose_param_->out_strides_[i] = out_shape.at(i + 1) * transpose_param_->out_strides_[i + 1];
}
extra_dims_ = out_shape.size() > DIMENSION_6D;
return RET_OK;
}
int TransposeInt8CPUKernel::DoTranspose(int task_id) {
int dims = out_tensors_.at(0)->shape().size();
MS_ASSERT(in_ptr_);
MS_ASSERT(out_ptr_);
MS_ASSERT(in_shape_);
MS_ASSERT(out_shape_);
MS_ASSERT(transpose_param_);
TransposeDimsInt8(in_ptr_, out_ptr_, out_shape_, dim_size_, position_ + dims * task_id, transpose_param_, task_id,
op_parameter_->thread_num_);
TransposeDimsInt8(in_ptr_, out_ptr_, out_shape_, transpose_param_, task_id, op_parameter_->thread_num_);
return RET_OK;
}
@@ -158,22 +113,12 @@ int TransposeInt8CPUKernel::Run() {
memcpy(in_shape_, in_dims.data(), in_dims.size() * sizeof(int));
memcpy(out_shape_, out_dims.data(), out_dims.size() * sizeof(int));
int ret = MallocTmpBuf();
if (ret != RET_OK) {
MS_LOG(ERROR) << "MallocTmpBuf error_code[" << ret << "]";
}
if (extra_dims_) {
ret = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeInt8Run, this, op_parameter_->thread_num_);
if (out_tensor->shape().size() > DIMENSION_6D) {
return static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeInt8Run, this, op_parameter_->thread_num_);
} else {
ret = DoTransposeInt8(in_ptr_, out_ptr_, out_shape_, transpose_param_);
return DoTransposeInt8(in_ptr_, out_ptr_, out_shape_, transpose_param_);
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]";
}
FreeTmpBuf();
return ret;
}
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Transpose, LiteKernelCreator<TransposeInt8CPUKernel>)

View File

@@ -43,19 +43,12 @@ class TransposeInt8CPUKernel : public InnerKernel {
public:
int DoTranspose(int task_id);
private:
int MallocTmpBuf();
void FreeTmpBuf();
private:
void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
TransposeParameter *transpose_param_;
TransposeFunc NHNCTransposeFunc_ = nullptr;
int8_t *in_ptr_ = nullptr;
int8_t *out_ptr_ = nullptr;
int *dim_size_ = nullptr;
int *position_ = nullptr;
bool extra_dims_ = false;
int in_shape_[20] = {0};
int out_shape_[20] = {0};
int nhnc_param_[3] = {0};