forked from mindspore-Ecosystem/mindspore

optimize_transpose_fp16

parent 3cc2b6513c
commit 69a11ce6e7
@@ -196,34 +196,34 @@
   } \
 }

-#define TRANSPOSE_DIMS(TYPE, NAME) \
-  void TransposeDims##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, int *size, int *position, \
-                           TransposeParameter *transpose_param, int task_id, int thread_num) { \
-    int *perm = transpose_param->perm_; \
-    int *strides = transpose_param->strides_; \
-    int *out_strides = transpose_param->out_strides_; \
-    int num_axes = transpose_param->num_axes_; \
-    size_t data_size = (*size) * output_shape[0]; \
-    size_t offset_size = UP_DIV(data_size, thread_num); \
-    size_t task_offset = offset_size * task_id; \
-    int count = data_size - task_offset; \
-    if (count <= 0) { \
-      return; \
-    } \
-    count = MSMIN(offset_size, count); \
-    for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
-      int pos = idx; \
-      int output_idx = 0; \
-      int input_idx = 0; \
-      for (int i = 0; i < num_axes; ++i) { \
-        *(position + i) = pos / *(size + i); \
-        int out_stride = i < num_axes - 1 ? out_strides[i] : 1; \
-        output_idx += (*(position + i) * out_stride); \
-        input_idx += (*(position + i) * strides[perm[i]]); \
-        pos -= *(position + i) * (*(size + i)); \
-      } \
-      out_data[output_idx] = in_data[input_idx]; \
-    } \
+#define TRANSPOSE_DIMS(TYPE, NAME) \
+  void TransposeDims##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, \
+                           TransposeParameter *transpose_param, int task_id, int thread_num) { \
+    int *perm = transpose_param->perm_; \
+    int *strides = transpose_param->strides_; \
+    int *out_strides = transpose_param->out_strides_; \
+    int num_axes = transpose_param->num_axes_; \
+    size_t data_size = (*out_strides) * output_shape[0]; \
+    size_t offset_size = UP_DIV(data_size, thread_num); \
+    size_t task_offset = offset_size * task_id; \
+    int count = data_size - task_offset; \
+    if (count <= 0) { \
+      return; \
+    } \
+    count = MSMIN(offset_size, count); \
+    for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
+      int pos = idx; \
+      int output_idx = 0; \
+      int input_idx = 0; \
+      for (int i = 0; i < num_axes; ++i) { \
+        int position = pos / *(out_strides + i); \
+        int out_stride = i < num_axes - 1 ? out_strides[i] : 1; \
+        output_idx += (position * out_stride); \
+        input_idx += (position * strides[perm[i]]); \
+        pos -= position * (*(out_strides + i)); \
+      } \
+      out_data[output_idx] = in_data[input_idx]; \
+    } \
   }

 #define DOTRANSPOSE(TYPE, NAME) \
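Note: the rewritten TRANSPOSE_DIMS macro can drop the size and position scratch
arrays because, for the output tensor, the old size array (the suffix products of
output_shape) is by construction identical to the out_strides_ that
TransposeParameter already carries; data_size likewise becomes
out_strides[0] * output_shape[0], the total element count. A minimal sketch of
that equivalence (the helper name and the fixed bound of 8 axes are illustrative,
not part of nnacl):

    #include <assert.h>

    // Both arrays are suffix products of the output shape, so they coincide.
    static void CheckSizeEqualsOutStrides(const int *output_shape, int num_axes) {
      int size[8];         // old per-call scratch buffer
      int out_strides[8];  // precomputed once in TransposeParameter
      size[num_axes - 1] = 1;
      out_strides[num_axes - 1] = 1;
      for (int i = num_axes - 1; i > 0; --i) {
        size[i - 1] = size[i] * output_shape[i];
        out_strides[i - 1] = out_strides[i] * output_shape[i];
      }
      for (int i = 0; i < num_axes; ++i) {
        assert(size[i] == out_strides[i]);
      }
    }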
@@ -40,21 +40,21 @@ int DoTransposeInt64(const int64_t *in_data, int64_t *out_data, const int *outpu
                      TransposeParameter *transpose_param);
 int DoTransposeBool(const bool *in_data, bool *out_data, const int *output_shape, TransposeParameter *transpose_param);

-void TransposeDimsUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape, int *size, int *position,
+void TransposeDimsUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape,
                         TransposeParameter *transpose_param, int task_id, int thread_num);
-void TransposeDimsUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape, int *size, int *position,
+void TransposeDimsUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape,
                          TransposeParameter *transpose_param, int task_id, int thread_num);
-void TransposeDimsUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape, int *size, int *position,
+void TransposeDimsUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape,
                          TransposeParameter *transpose_param, int task_id, int thread_num);
-void TransposeDimsUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape, int *size, int *position,
+void TransposeDimsUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape,
                          TransposeParameter *transpose_param, int task_id, int thread_num);
-void TransposeDimsInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape, int *size, int *position,
+void TransposeDimsInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape,
                         TransposeParameter *transpose_param, int task_id, int thread_num);
-void TransposeDimsInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape, int *size, int *position,
+void TransposeDimsInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape,
                         TransposeParameter *transpose_param, int task_id, int thread_num);
-void TransposeDimsInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape, int *size, int *position,
+void TransposeDimsInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape,
                         TransposeParameter *transpose_param, int task_id, int thread_num);
-void TransposeDimsBool(const bool *in_data, bool *out_data, const int *output_shape, int *size, int *position,
+void TransposeDimsBool(const bool *in_data, bool *out_data, const int *output_shape,
                        TransposeParameter *transpose_param, int task_id, int thread_num);

 #ifdef __cplusplus
@@ -159,7 +159,8 @@ void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
   }
 }

-void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int channel) {
+void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int channel, int task_id,
+                        int thread_count) {
 #ifdef ENABLE_ARM64
   // Transpose16x8 in arm64
   const int hw_tile = C16NUM;
@@ -167,13 +168,27 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
   // Transpose8x8 in others
   const int hw_tile = C8NUM;
 #endif
-  int hw_align = plane / hw_tile * hw_tile;
+  int hw_align = plane / hw_tile;
+  int task_start = 0;
+  int task_end = plane;
+  if (thread_count > 0) {
+    int offset_hw = UP_DIV(hw_align, thread_count) * hw_tile;
+    task_start = offset_hw * task_id;
+    int count = plane - task_start;
+    if (count <= 0) {
+      return;
+    }
+    task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw);
+    hw_align = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0);
+  } else {
+    hw_align *= hw_tile;
+  }
   int c8 = channel / C8NUM * C8NUM;
   int batch = plane * channel;
   for (int n = 0; n < batches; n++) {
     const float16_t *src_batch = (const float16_t *)src + n * batch;
     float16_t *dst_batch = (float16_t *)dst + n * batch;
-    int hw = 0;
+    int hw = task_start;
     for (; hw < hw_align; hw += hw_tile) {
       int c = 0;
       for (; c < c8; c += C8NUM) {
@@ -203,7 +218,7 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
       }
     }
   }
-  for (; hw < plane; hw++) {
+  for (; hw < task_end; hw++) {
     const float16_t *src_ptr = src_batch + hw * channel;
     float16_t *dst_ptr = dst_batch + hw;
     for (size_t i = 0; i < channel; i++) {
@@ -213,8 +228,8 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
   }
 }

-void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
-  return PackNHWCToNCHWFp16(src, dst, batch, channel, plane);
+void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count) {
+  return PackNHWCToNCHWFp16(src, dst, batch, channel, plane, task_id, thread_count);
 }

 void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
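Note: the new task_id/thread_count parameters slice plane into whole hw_tile
chunks per thread; only the thread that owns the tail runs the scalar remainder
loop, and thread_count == 0 keeps the old single-threaded behaviour. A
self-contained sketch of the split (UP_DIV and MSMIN are reproduced here to match
their nnacl definitions; the numbers in the comment are just an example):

    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))
    #define MSMIN(a, b) ((a) < (b) ? (a) : (b))

    // For plane = 100 and hw_tile = 16 with 4 threads, the tasks cover
    // [0,32) [32,64) [64,96) [96,100); the last range is the scalar tail.
    void SplitPlane(int plane, int hw_tile, int thread_count, int task_id) {
      int tiles = plane / hw_tile;                            // whole tiles in the plane
      int offset_hw = UP_DIV(tiles, thread_count) * hw_tile;  // rows handed to each task
      int task_start = offset_hw * task_id;
      if (plane - task_start <= 0) {
        return;  // surplus threads get no work, mirroring the kernel's early return
      }
      int task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw);
      // Tiled transpose runs on [task_start, hw_align); the per-row loop on [hw_align, task_end).
      int hw_align = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0);
      (void)hw_align;
    }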
@@ -37,9 +37,9 @@ void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int

 void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel);

-void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);
+void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);

-void PackNHWCToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel);
+void PackNHWCToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);

 void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel);

@@ -173,38 +173,44 @@ void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, int *strid
   }
 }

-void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
-                       const int *perm, const int *output_shape, int dims, int *size, int *position) {
-  *(size + dims - 1) = 1;
-  for (int i = dims - 1; i > 0; --i) {
-    *(size + i - 1) = *(size + i) * output_shape[i];
+void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
+                       TransposeParameter *param, int task_id, int thread_num) {
+  int *perm = param->perm_;
+  int *strides = param->strides_;
+  int *out_strides = param->out_strides_;
+  int num_axes = param->num_axes_;
+  size_t data_size = (*out_strides) * output_shape[0];
+  size_t offset_size = UP_DIV(data_size, thread_num);
+  size_t task_offset = offset_size * task_id;
+  int count = data_size - task_offset;
+  if (count <= 0) {
+    return;
   }

-  for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) {
+  count = MSMIN(offset_size, count);
+  for (size_t idx = task_offset; idx < task_offset + count; ++idx) {
     int pos = idx;
     int output_idx = 0;
     int input_idx = 0;
-    for (int i = 0; i < dims; ++i) {
-      *(position + i) = pos / *(size + i);
-      int out_stride = i < dims - 1 ? out_strides[i] : 1;
-      output_idx += (*(position + i) * out_stride);
-      input_idx += (*(position + i) * strides[perm[i]]);
-      pos -= *(position + i) * (*(size + i));
+    for (int i = 0; i < num_axes; ++i) {
+      int position = pos / *(out_strides + i);
+      int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
+      output_idx += (position * out_stride);
+      input_idx += (position * strides[perm[i]]);
+      pos -= position * (*(out_strides + i));
     }
     out_data[output_idx] = in_data[input_idx];
   }
 }

-int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
-                    TransposeParameter *transpose_param, int *size, int *position) {
+int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, TransposeParameter *param) {
   if (in_data == NULL || out_data == NULL) {
     return NNACL_ERR;
   }
-  int *perm = transpose_param->perm_;
-  int *strides = transpose_param->strides_;
-  int *out_strides = transpose_param->out_strides_;
-  int data_size = transpose_param->data_size_;
-  int num_axes = transpose_param->num_axes_;
+  int *perm = param->perm_;
+  int *strides = param->strides_;
+  int *out_strides = param->out_strides_;
+  int data_size = param->data_size_;
+  int num_axes = param->num_axes_;

   // check if transpose is needed
   bool needTranspose = false;
@@ -235,7 +241,7 @@ int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *ou
   } else if (num_axes == 6) {
     Fp16TransposeDim6(in_data, out_data, strides, out_strides, perm, output_shape);
   } else {
-    TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
+    return NNACL_ERR;
   }
   return NNACL_OK;
 }
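Note: DoTransposeFp16 now rejects tensors above 6D with NNACL_ERR instead of
falling through to a serial N-D loop; those shapes are expected to be routed to
the threaded TransposeDimsFp16. A hedged sketch of that caller-side dispatch (the
wrapper name is illustrative; in the real code the split lives in
TransposeFp16CPUKernel):

    int RunTransposeFp16(const float16_t *in, float16_t *out, const int *out_shape,
                         TransposeParameter *param, int task_id, int thread_num) {
      if (param->num_axes_ <= 6) {
        return DoTransposeFp16(in, out, out_shape, param);  // single-threaded 2D-6D path
      }
      TransposeDimsFp16(in, out, out_shape, param, task_id, thread_num);  // sliced across threads
      return NNACL_OK;
    }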
@@ -24,8 +24,9 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
-                    TransposeParameter *transpose_param, int *size, int *position);
+void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
+                       TransposeParameter *param, int task_id, int thread_num);
+int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, TransposeParameter *param);
 #ifdef __cplusplus
 }
 #endif
@@ -171,13 +171,13 @@ void TransposeDim6Fp32(const float *in_data, float *out_data, const int *strides
   }
 }

-void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, const int *size, int *position,
+void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape,
                        TransposeParameter *transpose_param, int task_id, int thread_num) {
   int *perm = transpose_param->perm_;
   int *strides = transpose_param->strides_;
   int *out_strides = transpose_param->out_strides_;
   int num_axes = transpose_param->num_axes_;
-  size_t data_size = (*size) * output_shape[0];
+  size_t data_size = (*out_strides) * output_shape[0];
   size_t offset_size = UP_DIV(data_size, thread_num);
   size_t task_offset = offset_size * task_id;
   int count = data_size - task_offset;
@@ -190,11 +190,11 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_
     int output_idx = 0;
     int input_idx = 0;
     for (int i = 0; i < num_axes; ++i) {
-      *(position + i) = pos / *(size + i);
+      int position = pos / *(out_strides + i);
       int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
-      output_idx += (*(position + i) * out_stride);
-      input_idx += (*(position + i) * strides[perm[i]]);
-      pos -= *(position + i) * (*(size + i));
+      output_idx += (position * out_stride);
+      input_idx += (position * strides[perm[i]]);
+      pos -= position * (*(out_strides + i));
     }
     out_data[output_idx] = in_data[input_idx];
   }
@@ -26,7 +26,7 @@ extern "C" {
 #endif

 int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *param);
-void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, const int *size, int *position,
+void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape,
                        TransposeParameter *transpose_param, int task_id, int thread_num);
 #ifdef __cplusplus
 }
@@ -220,13 +220,13 @@ int DoTransposeInt8(const int8_t *in_data, int8_t *out_data, const int *output_s
   return NNACL_OK;
 }

-void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape, const int *size, int *position,
+void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape,
                        TransposeParameter *transpose_param, int task_id, int thread_num) {
   int *perm = transpose_param->perm_;
   int *strides = transpose_param->strides_;
   int *out_strides = transpose_param->out_strides_;
   int num_axes = transpose_param->num_axes_;
-  size_t data_size = (*size) * output_shape[0];
+  size_t data_size = (*out_strides) * output_shape[0];
   size_t offset_size = UP_DIV(data_size, thread_num);
   size_t task_offset = offset_size * task_id;
   int count = data_size - task_offset;
@@ -239,11 +239,11 @@ void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *outpu
     int output_idx = 0;
     int input_idx = 0;
     for (int i = 0; i < num_axes; ++i) {
-      *(position + i) = pos / *(size + i);
+      int position = pos / *(out_strides + i);
       int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
-      output_idx += (*(position + i) * out_stride);
-      input_idx += (*(position + i) * strides[perm[i]]);
-      pos -= *(position + i) * (*(size + i));
+      output_idx += (position * out_stride);
+      input_idx += (position * strides[perm[i]]);
+      pos -= position * (*(out_strides + i));
     }
     out_data[output_idx] = in_data[input_idx];
   }
@@ -27,7 +27,7 @@ extern "C" {

 int DoTransposeInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape,
                     TransposeParameter *transpose_param);
-void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape, const int *size, int *position,
+void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape,
                        TransposeParameter *transpose_param, int task_id, int thread_num);
 #ifdef __cplusplus
 }
@@ -127,18 +127,9 @@ void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, con
   auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
   const float block_size = 128.0;
   size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
-  int dims = SizeToInt(axes_.size());
-  int *size = new int[dims];
-  size[dims - 1] = 1;
-  for (int i = dims - 1; i > 0; i--) {
-    size[i - 1] = size[i] * output_shape_[i];
-  }
-  int **position = new int *[thread_num];
-  for (size_t i = 0; i < thread_num; ++i) {
-    position[i] = new int[dims];
-  }
   std::vector<common::Task> tasks;
-  std::function<void(const T *, T *, const int *, int *, int *, TransposeParameter *, int, int)> TransposeDims;
+  std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;

   if constexpr (std::is_same_v<T, int8_t>) {
     TransposeDims = &TransposeDimsInt8;
   } else if constexpr (std::is_same_v<T, int16_t>) {
@@ -162,15 +153,12 @@ void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, con
   }
   for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
     auto task = [&, task_id, thread_num]() {
-      TransposeDims(input_addr, output_addr, output_shape, size, position[task_id], &transpose_param_, task_id,
-                    SizeToInt(thread_num));
+      TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
       return common::SUCCESS;
     };
     tasks.emplace_back(task);
   }
   common::ThreadPool::GetInstance().SyncRun(tasks);
-  delete[] size;
-  delete[] position;
 }
 }  // namespace kernel
 }  // namespace mindspore
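Note: ParallelRun no longer heap-allocates the shared size array and one position
buffer per thread; the old cleanup also leaked the inner per-thread arrays, since
only the outer position array was deleted. Each task now recovers coordinates
inline from out_strides_, roughly as in this illustrative helper:

    // out_strides[num_axes - 1] is 1, so no separate "size" array is needed.
    void FlatToPosition(int flat_idx, const int *out_strides, int num_axes, int *pos_out) {
      for (int i = 0; i < num_axes; ++i) {
        pos_out[i] = flat_idx / out_strides[i];
        flat_idx -= pos_out[i] * out_strides[i];
      }
    }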
@@ -44,7 +44,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   PackNCHWToNHWCFp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
-                     weight_tensor->Batch());
+                     weight_tensor->Batch(), 0, 0);

   bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t)));
   if (bias_data_ == nullptr) {
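Note: passing 0, 0 for the new task_id/thread_count arguments keeps this weight
repacking single-threaded: thread_count > 0 is false, so PackNHWCToNCHWFp16 takes
the else branch and processes the whole plane in one call.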
@@ -13,9 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "src/runtime/kernel/arm/fp16/transpose_fp16.h"
-#include <vector>
+#include "nnacl/fp16/pack_fp16.h"
 #include "nnacl/fp16/transpose_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
@@ -30,63 +30,19 @@ using mindspore::lite::RET_OP_EXECUTE_FAILURE;
 using mindspore::schema::PrimitiveType_Transpose;

 namespace mindspore::kernel {
-int TransposeFp16CPUKernel::Init() {
-  if (!InferShapeDone()) {
-    return RET_OK;
-  }
-  return TransposeCPUKernel::ReSize();
+void TransposeFp16CPUKernel::GetNchwToNhwcFunc() { NHNCTransposeFunc_ = PackNCHWToNHWCFp16; }
+
+void TransposeFp16CPUKernel::GetNhwcToNchwFunc() { NHNCTransposeFunc_ = PackNHWCToNCHWFp16; }
+
+int TransposeFp16CPUKernel::TransposeDim2to6() {
+  return DoTransposeFp16(static_cast<const float16_t *>(in_data_), static_cast<float16_t *>(out_data_), out_shape_,
+                         param_);
 }

-int TransposeFp16CPUKernel::Run() {
-  MS_ASSERT(in_tensors_.size() == 1 || in_tensors_.size() == 2);
-  TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
-  param->data_size_ = in_tensors_[0]->Size();
-  MS_ASSERT(out_tensors_.size() == 1);
-  auto &in_tensor = in_tensors_.front();
-  auto &out_tensor = out_tensors_.front();
-  if (in_tensor == nullptr || out_tensor == nullptr) {
-    MS_LOG(ERROR) << "null pointer referencing.";
-    return RET_ERROR;
-  }
-  in_data_fp16_ = reinterpret_cast<float16_t *>(in_tensor->MutableData());
-  out_data_fp16_ = reinterpret_cast<float16_t *>(out_tensor->MutableData());
-  MS_ASSERT(in_data_fp16_);
-  MS_ASSERT(out_data_fp16_);
-
-  if (in_tensor->shape().size() != static_cast<size_t>(param->num_axes_)) {
-    memcpy(out_data_fp16_, in_data_fp16_, in_tensor->ElementsNum() * sizeof(float16_t));
-    return RET_OK;
-  }
-  int dims = out_tensor->shape().size();
-  if (dims > DIMENSION_6D) {
-    dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
-    if (dim_size_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc data failed";
-      return RET_ERROR;
-    }
-    position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
-    if (position_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc data failed";
-      context_->allocator->Free(dim_size_);
-      dim_size_ = nullptr;
-      return RET_ERROR;
-    }
-  }
-
-  MS_ASSERT(out_shape_);
-  auto ret = Fp16DoTranspose(in_data_fp16_, out_data_fp16_, out_shape_, param, dim_size_, position_);
-  if (dims > DIMENSION_6D) {
-    context_->allocator->Free(dim_size_);
-    context_->allocator->Free(position_);
-    dim_size_ = nullptr;
-    position_ = nullptr;
-  }
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Transpose run failed";
-    return RET_ERROR;
-  }
-
-  return ret;
+int TransposeFp16CPUKernel::TransposeDimGreaterThan6(int task_id) {
+  TransposeDimsFp16(static_cast<const float16_t *>(in_data_), static_cast<float16_t *>(out_data_), out_shape_, param_,
+                    task_id, op_parameter_->thread_num_);
+  return RET_OK;
 }

 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Transpose, LiteKernelCreator<TransposeFp16CPUKernel>)
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP16_TRANSPOSE_FP16_H_
 #define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP16_TRANSPOSE_FP16_H_

@@ -32,12 +31,11 @@ class TransposeFp16CPUKernel : public TransposeCPUKernel {
       : TransposeCPUKernel(param, inputs, outputs, ctx) {}
   ~TransposeFp16CPUKernel() = default;

-  int Init() override;
-  int Run() override;
-
  private:
-  float16_t *in_data_fp16_ = nullptr;
-  float16_t *out_data_fp16_ = nullptr;
+  void GetNchwToNhwcFunc() override;
+  void GetNhwcToNchwFunc() override;
+  int TransposeDim2to6() override;
+  int TransposeDimGreaterThan6(int task_id) override;
 };
 }  // namespace mindspore::kernel

@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "src/runtime/kernel/arm/fp32/transpose_fp32.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
@@ -36,16 +35,15 @@ int TransposeCPUKernel::Init() {
 }

 int TransposeCPUKernel::ReSize() {
-  TransposeParameter *param = reinterpret_cast<TransposeParameter *>(op_parameter_);
   if (in_tensors_.size() == 2) {
-    param->num_axes_ = in_tensors_.at(1)->ElementsNum();
+    param_->num_axes_ = in_tensors_.at(1)->ElementsNum();
   }
   int trans3d[3] = {0, 2, 1};
   int *perm_data = nullptr;
   auto input_tensor = in_tensors_.at(kInputIndex);
-  if (input_tensor->shape().size() != static_cast<size_t>(param->num_axes_)) {
-    if (input_tensor->shape().size() == 3 && param->num_axes_ == 4) {
-      param->num_axes_ = 3;
+  if (input_tensor->shape().size() != static_cast<size_t>(param_->num_axes_)) {
+    if (input_tensor->shape().size() == 3 && param_->num_axes_ == 4) {
+      param_->num_axes_ = 3;
       perm_data = trans3d;
     } else {
       return RET_OK;
@@ -55,21 +53,19 @@ int TransposeCPUKernel::ReSize() {
     auto perm_tensor = in_tensors_.at(1);
     perm_data = reinterpret_cast<int *>(perm_tensor->data_c());
   }
-  // set perm data
-  MS_ASSERT(perm_data != nullptr);
-  for (int i = 0; i < param->num_axes_; ++i) {
-    param->perm_[i] = perm_data[i];
+  for (int i = 0; i < param_->num_axes_; ++i) {
+    param_->perm_[i] = perm_data[i];
   }
   auto &inTensor = in_tensors_.front();
   auto &outTensor = out_tensors_.front();
   auto in_shape = inTensor->shape();
   auto out_shape = outTensor->shape();
-  param->strides_[param->num_axes_ - 1] = 1;
-  param->out_strides_[param->num_axes_ - 1] = 1;
-  param->data_size_ = inTensor->Size();
-  for (int i = param->num_axes_ - 2; i >= 0; i--) {
-    param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1];
-    param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1];
+  param_->strides_[param_->num_axes_ - 1] = 1;
+  param_->out_strides_[param_->num_axes_ - 1] = 1;
+  param_->data_size_ = inTensor->Size();
+  for (int i = param_->num_axes_ - 2; i >= 0; i--) {
+    param_->strides_[i] = in_shape.at(i + 1) * param_->strides_[i + 1];
+    param_->out_strides_[i] = out_shape.at(i + 1) * param_->out_strides_[i + 1];
   }

   if (this->out_shape_ != nullptr) {
@@ -92,35 +88,49 @@ TransposeCPUKernel::~TransposeCPUKernel() {
   }
 }

-void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor,
-                                              TransposeParameter *param) {
+void TransposeCPUKernel::GetNchwToNhwcFunc() { NHNCTransposeFunc_ = PackNCHWToNHWCFp32; }
+
+void TransposeCPUKernel::GetNhwcToNchwFunc() { NHNCTransposeFunc_ = PackNHWCToNCHWFp32; }
+
+int TransposeCPUKernel::TransposeDim2to6() {
+  return DoTransposeFp32(static_cast<const float *>(in_data_), static_cast<float *>(out_data_), out_shape_, param_);
+}
+
+int TransposeCPUKernel::TransposeDimGreaterThan6(int task_id) {
+  TransposeDimsFp32(static_cast<const float *>(in_data_), static_cast<float *>(out_data_), out_shape_, param_, task_id,
+                    op_parameter_->thread_num_);
+  return RET_OK;
+}
+
+void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor) {
+  if (in_tensor->shape().size() != 4) {
+    return;
+  }
   auto out_shape = out_tensor->shape();
-  if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 &&
-      param->perm_[3] == 1) {
+  if (param_->perm_[0] == 0 && param_->perm_[1] == 2 && param_->perm_[2] == 3 && param_->perm_[3] == 1) {
     nhnc_param_[0] = out_shape[0];
     nhnc_param_[1] = out_shape[1] * out_shape[2];
     nhnc_param_[2] = out_shape[3];
     if (in_tensor->data_type() == kNumberTypeFloat32) {
-      NHNCTransposeFunc_ = PackNCHWToNHWCFp32;
+      GetNchwToNhwcFunc();
     }
   }
-  if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
-      param->perm_[3] == 2) {
+  if (param_->perm_[0] == 0 && param_->perm_[1] == 3 && param_->perm_[2] == 1 && param_->perm_[3] == 2) {
     nhnc_param_[0] = out_shape[0];
     nhnc_param_[1] = out_shape[2] * out_shape[3];
     nhnc_param_[2] = out_shape[1];
     if (in_tensor->data_type() == kNumberTypeFloat32) {
-      NHNCTransposeFunc_ = PackNHWCToNCHWFp32;
+      GetNhwcToNchwFunc();
     }
   }
 }

 int TransposeCPUKernel::RunImpl(int task_id) {
   if (NHNCTransposeFunc_ != nullptr) {
-    NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id, thread_count_);
+    NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id,
+                       op_parameter_->thread_num_);
   } else {
-    TransposeDimsFp32(in_data_, out_data_, out_shape_, dim_size_, position_ + dims_ * task_id, param_, task_id,
-                      thread_count_);
+    return TransposeDimGreaterThan6(task_id);
   }
   return RET_OK;
 }
@@ -143,63 +153,26 @@ int TransposeCPUKernel::Run() {
     MS_LOG(ERROR) << "null pointer dreferencing.";
     return RET_ERROR;
   }
-  in_data_ = reinterpret_cast<float *>(in_tensor->MutableData());
-  out_data_ = reinterpret_cast<float *>(out_tensor->MutableData());
+  in_data_ = in_tensor->data_c();
+  out_data_ = out_tensor->data_c();
   MS_ASSERT(in_data_);
   MS_ASSERT(out_data_);

-  param_ = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
   if (in_tensor->shape().size() != static_cast<size_t>(param_->num_axes_)) {
-    memcpy(out_data_, in_data_, in_tensor->ElementsNum() * sizeof(float));
+    memcpy(out_data_, in_data_, in_tensor->Size());
     return RET_OK;
   }
-  thread_count_ = op_parameter_->thread_num_;
-  GetNHNCTransposeFunc(in_tensor, out_tensor, param_);
+  GetNHNCTransposeFunc(in_tensor, out_tensor);
   if (NHNCTransposeFunc_ != nullptr) {
-    auto ret = static_cast<const lite::InnerContext *>(this->context_)
-                 ->thread_pool_->ParallelLaunch(TransposeImpl, this, thread_count_);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "NHNCTransposeFunc_ is error!";
-    }
-    return ret;
+    return static_cast<const lite::InnerContext *>(this->context_)
+      ->thread_pool_->ParallelLaunch(TransposeImpl, this, op_parameter_->thread_num_);
   }

   MS_ASSERT(out_shape_);
-  dims_ = out_tensor->shape().size();
-  if (dims_ > DIMENSION_6D) {
-    dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int)));
-    if (dim_size_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc data failed";
-      return RET_NULL_PTR;
-    }
-    *(dim_size_ + dims_ - 1) = 1;
-    for (int i = dims_ - 1; i > 0; --i) {
-      *(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i];
-    }
-    position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int) * thread_count_));
-    if (position_ == nullptr) {
-      context_->allocator->Free(dim_size_);
-      MS_LOG(ERROR) << "Malloc data failed";
-      return RET_NULL_PTR;
-    }
-  }
-  int ret;
-  if (dims_ > DIMENSION_6D) {
-    ret = static_cast<const lite::InnerContext *>(this->context_)
-            ->thread_pool_->ParallelLaunch(TransposeImpl, this, thread_count_);
+  if (out_tensor->shape().size() <= DIMENSION_6D) {
+    return TransposeDim2to6();
   } else {
-    ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param_);
+    return static_cast<const lite::InnerContext *>(this->context_)
+      ->thread_pool_->ParallelLaunch(TransposeImpl, this, op_parameter_->thread_num_);
   }
-  if (dims_ > DIMENSION_6D) {
-    context_->allocator->Free(dim_size_);
-    context_->allocator->Free(position_);
-    dim_size_ = nullptr;
-    position_ = nullptr;
-  }
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Transpose run failed";
-  }
-  return ret;
 }

 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_H_
 #define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_H_

@@ -33,7 +32,9 @@ class TransposeCPUKernel : public InnerKernel {
  public:
   explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
                               const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(param, inputs, outputs, ctx) {}
+      : InnerKernel(param, inputs, outputs, ctx) {
+    param_ = reinterpret_cast<TransposeParameter *>(param);
+  }
   ~TransposeCPUKernel() override;

   int Init() override;
@@ -42,17 +43,18 @@ class TransposeCPUKernel : public InnerKernel {
   int RunImpl(int task_id);

  protected:
-  void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
-  float *in_data_ = nullptr;
-  float *out_data_ = nullptr;
+  virtual void GetNchwToNhwcFunc();
+  virtual void GetNhwcToNchwFunc();
+  virtual int TransposeDim2to6();
+  virtual int TransposeDimGreaterThan6(int task_id);
+
+  void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor);
+  void *in_data_ = nullptr;
+  void *out_data_ = nullptr;
   int *out_shape_ = nullptr;
-  int *dim_size_ = nullptr;
-  int *position_ = nullptr;
+  TransposeParameter *param_ = nullptr;
   TransposeFunc NHNCTransposeFunc_ = nullptr;
-  int thread_count_ = 0;
   int nhnc_param_[3] = {0};
-  int dims_ = 0;
 };
 }  // namespace mindspore::kernel

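Note: the refactor turns TransposeCPUKernel into a template-method base class:
the fp32 routines are the defaults, and TransposeFp16CPUKernel only overrides the
four virtual hooks. A stripped-down sketch of the pattern (names abbreviated,
everything else omitted):

    // Minimal sketch of the dispatch structure, not the real kernel classes.
    struct TransposeBase {
      virtual ~TransposeBase() = default;
      int Run(int dims, int task_id) {
        return dims <= 6 ? TransposeDim2to6() : TransposeDimGreaterThan6(task_id);
      }
     protected:
      virtual int TransposeDim2to6() { return 0; }         // would call DoTransposeFp32
      virtual int TransposeDimGreaterThan6(int task_id) {  // would call TransposeDimsFp32
        (void)task_id;
        return 0;
      }
    };

    struct TransposeFp16 : TransposeBase {
     protected:
      int TransposeDim2to6() override { return 0; }                 // would call DoTransposeFp16
      int TransposeDimGreaterThan6(int task_id) override {          // would call TransposeDimsFp16
        (void)task_id;
        return 0;
      }
    };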
@@ -41,47 +41,6 @@ int TransposeInt8Run(void *cdata, int task_id) {
   return RET_OK;
 }

-void TransposeInt8CPUKernel::FreeTmpBuf() {
-  if (!extra_dims_) {
-    return;
-  }
-  if (dim_size_ != nullptr) {
-    context_->allocator->Free(dim_size_);
-    dim_size_ = nullptr;
-  }
-  if (position_ != nullptr) {
-    context_->allocator->Free(position_);
-    position_ = nullptr;
-  }
-  return;
-}
-
-int TransposeInt8CPUKernel::MallocTmpBuf() {
-  if (!extra_dims_) {
-    return RET_OK;
-  }
-
-  int dims = out_tensors_.at(0)->shape().size();
-
-  dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
-  if (dim_size_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc data failed";
-    return RET_ERROR;
-  }
-  *(dim_size_ + dims - 1) = 1;
-  for (int i = dims - 1; i > 0; --i) {
-    *(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i];
-  }
-  position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int) * op_parameter_->thread_num_));
-  if (position_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc data failed";
-    context_->allocator->Free(dim_size_);
-    dim_size_ = nullptr;
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int TransposeInt8CPUKernel::ReSize() {
   auto in_tensor = in_tensors_.front();
   auto out_tensor = out_tensors_.front();
@@ -105,20 +64,16 @@ int TransposeInt8CPUKernel::ReSize() {
     transpose_param_->strides_[i] = in_shape.at(i + 1) * transpose_param_->strides_[i + 1];
     transpose_param_->out_strides_[i] = out_shape.at(i + 1) * transpose_param_->out_strides_[i + 1];
   }
-
-  extra_dims_ = out_shape.size() > DIMENSION_6D;
   return RET_OK;
 }

 int TransposeInt8CPUKernel::DoTranspose(int task_id) {
-  int dims = out_tensors_.at(0)->shape().size();
   MS_ASSERT(in_ptr_);
   MS_ASSERT(out_ptr_);
   MS_ASSERT(in_shape_);
   MS_ASSERT(out_shape_);
   MS_ASSERT(transpose_param_);
-  TransposeDimsInt8(in_ptr_, out_ptr_, out_shape_, dim_size_, position_ + dims * task_id, transpose_param_, task_id,
-                    op_parameter_->thread_num_);
+  TransposeDimsInt8(in_ptr_, out_ptr_, out_shape_, transpose_param_, task_id, op_parameter_->thread_num_);
   return RET_OK;
 }

@@ -158,22 +113,12 @@ int TransposeInt8CPUKernel::Run() {
   memcpy(in_shape_, in_dims.data(), in_dims.size() * sizeof(int));
   memcpy(out_shape_, out_dims.data(), out_dims.size() * sizeof(int));

-  int ret = MallocTmpBuf();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "MallocTmpBuf error_code[" << ret << "]";
-  }
-  if (extra_dims_) {
-    ret = static_cast<const lite::InnerContext *>(this->context_)
-            ->thread_pool_->ParallelLaunch(TransposeInt8Run, this, op_parameter_->thread_num_);
+  if (out_tensor->shape().size() > DIMENSION_6D) {
+    return static_cast<const lite::InnerContext *>(this->context_)
+      ->thread_pool_->ParallelLaunch(TransposeInt8Run, this, op_parameter_->thread_num_);
   } else {
-    ret = DoTransposeInt8(in_ptr_, out_ptr_, out_shape_, transpose_param_);
+    return DoTransposeInt8(in_ptr_, out_ptr_, out_shape_, transpose_param_);
   }
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]";
-  }
-
-  FreeTmpBuf();
-  return ret;
 }

 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Transpose, LiteKernelCreator<TransposeInt8CPUKernel>)
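Note: with the scratch buffers gone, the int8 Run collapses to a straight
dispatch: shapes above 6D go through ParallelLaunch(TransposeInt8Run, ...) and
everything else through the serial DoTransposeInt8, with return codes propagated
directly instead of being logged and funneled through MallocTmpBuf/FreeTmpBuf.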
@@ -43,19 +43,12 @@ class TransposeInt8CPUKernel : public InnerKernel {
  public:
   int DoTranspose(int task_id);

- private:
-  int MallocTmpBuf();
-  void FreeTmpBuf();
-
  private:
   void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
   TransposeParameter *transpose_param_;
   TransposeFunc NHNCTransposeFunc_ = nullptr;
   int8_t *in_ptr_ = nullptr;
   int8_t *out_ptr_ = nullptr;
-  int *dim_size_ = nullptr;
-  int *position_ = nullptr;
-  bool extra_dims_ = false;
   int in_shape_[20] = {0};
   int out_shape_[20] = {0};
   int nhnc_param_[3] = {0};