Fix fp16 bug

This commit is contained in:
mengyuanli 2021-07-12 18:09:44 +08:00
parent bda4c5b473
commit 45e621388b
5 changed files with 65 additions and 288 deletions

View File

@ -83,6 +83,16 @@ void LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *ac
Tensor *old_tensor = kernel_->in_tensors()[i];
if (OfflineIsolated(kernels, *kernel_, *old_tensor)) {
if (old_tensor->data_type() == kNumberTypeFloat16 || old_tensor->data_type() == kNumberTypeFloat32) {
old_tensor->set_data_type(kernel_->desc().data_type);
}
if (old_tensor->data_type() == kObjectTypeTensorType) {
auto old_tensorlist = reinterpret_cast<TensorList *>(old_tensor);
if (old_tensorlist->tensors_data_type() == kNumberTypeFloat16 ||
old_tensorlist->tensors_data_type() == kNumberTypeFloat32) {
old_tensorlist->set_tensors_data_type(kernel_->desc().data_type);
}
}
continue;
}
@ -313,7 +323,31 @@ void LiteOpActor::CopyInputData(Tensor *dst_tensor, Tensor *src_tensor) {
}
// Converts the content of |src| into |dst| for an input whose element type differs.
// The reference count of |dst| is reset first. A TensorList source (data type
// kObjectTypeTensorType) is routed to CastTensorListInputData, anything else to
// CastTensorInputData. One reference on |src| is dropped afterwards regardless of
// the outcome.
// Returns the status code produced by the selected cast helper.
int LiteOpActor::CastInputData(Tensor *dst, Tensor *src) {
  dst->ResetRefCount();
  const bool src_is_tensorlist = (src->data_type() == kObjectTypeTensorType);
  const int status =
    src_is_tensorlist
      ? CastTensorListInputData(reinterpret_cast<TensorList *>(dst), reinterpret_cast<TensorList *>(src))
      : CastTensorInputData(dst, src);
  src->DecRefCount();
  return status;
}
// Tells whether the data of |src_tensor| has to be cast before |dst_tensor| can use it.
//  - both plain tensors : true when their data types differ;
//  - both TensorLists   : true when their element data types differ;
//  - one of each        : false.
bool LiteOpActor::NeedCastData(Tensor *dst_tensor, Tensor *src_tensor) {
  const auto dst_type = dst_tensor->data_type();
  const auto src_type = src_tensor->data_type();
  const bool dst_is_list = (dst_type == kObjectTypeTensorType);
  const bool src_is_list = (src_type == kObjectTypeTensorType);

  // A plain tensor paired with a TensorList is never cast here.
  if (dst_is_list != src_is_list) {
    return false;
  }
  if (!dst_is_list) {
    return dst_type != src_type;
  }
  auto *dst_list = reinterpret_cast<TensorList *>(dst_tensor);
  auto *src_list = reinterpret_cast<TensorList *>(src_tensor);
  return dst_list->tensors_data_type() != src_list->tensors_data_type();
}
int LiteOpActor::CastTensorInputData(Tensor *dst, Tensor *src) {
dst->MallocData();
#if defined(ENABLE_ARM) && defined(ENABLE_FP16)
if (dst->shape() != src->shape()) {
@ -335,13 +369,37 @@ int LiteOpActor::CastInputData(Tensor *dst, Tensor *src) {
MS_LOG(ERROR) << "not support dst_data_type: " << dst_data_type << " src_data_type: " << src_data_type;
return RET_NOT_SUPPORT;
}
src->DecRefCount();
return RET_OK;
#endif
src->DecRefCount();
return RET_ERROR;
}
// Casts every element of |src_tensorlist| into |dst_tensorlist|.
// |dst_tensorlist| takes the shape and allocator of the source, and its elements are
// (re)allocated with the opposite float width of the source elements
// (float16 -> float32, float32 -> float16) before each element is cast with
// CastTensorInputData.
//
// Returns RET_OK when every element was cast, RET_NOT_SUPPORT when the source
// element type is neither float16 nor float32, RET_ERROR when the destination
// elements could not be prepared, or the first non-OK status reported by
// CastTensorInputData.
int LiteOpActor::CastTensorListInputData(TensorList *dst_tensorlist, TensorList *src_tensorlist) {
  MS_ASSERT(src_tensorlist != nullptr);
  MS_ASSERT(dst_tensorlist != nullptr);
  dst_tensorlist->set_shape(src_tensorlist->shape());

  // Take one snapshot of the element list instead of binding references to
  // `tensors()[i]`: if tensors() returns by value, such a reference would dangle
  // as soon as the temporary vector is destroyed.
  const auto src_tensors = src_tensorlist->tensors();
  std::vector<std::vector<int>> tensors_shapes{};
  tensors_shapes.reserve(src_tensors.size());
  for (size_t i = 0; i < src_tensors.size(); ++i) {
    tensors_shapes.push_back(src_tensors[i]->shape());
  }

  const auto src_elem_type = src_tensorlist->tensors_data_type();
  auto dst_elem_type = src_elem_type;
  if (src_elem_type == kNumberTypeFloat16) {
    dst_elem_type = kNumberTypeFloat32;
  } else if (src_elem_type == kNumberTypeFloat32) {
    dst_elem_type = kNumberTypeFloat16;
  } else {
    // Without an allocation the destination elements below would not exist.
    MS_LOG(ERROR) << "not support tensorlist element data type: " << src_elem_type;
    return RET_NOT_SUPPORT;
  }
  if (dst_tensorlist->MallocTensorListData(dst_elem_type, tensors_shapes) != RET_OK) {
    MS_LOG(ERROR) << "malloc tensorlist data failed";
    return RET_ERROR;
  }
  dst_tensorlist->ResetRefCount();
  dst_tensorlist->set_allocator(src_tensorlist->allocator());

  // Read the destination elements only after they have been (re)allocated.
  const auto dst_tensors = dst_tensorlist->tensors();
  if (dst_tensors.size() < src_tensors.size()) {
    MS_LOG(ERROR) << "dst tensorlist has " << dst_tensors.size() << " tensors, but src tensorlist has "
                  << src_tensors.size();
    return RET_ERROR;
  }
  for (size_t i = 0; i < src_tensors.size(); ++i) {
    const int ret = CastTensorInputData(dst_tensors[i], src_tensors[i]);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "cast tensorlist element " << i << " failed";
      return ret;
    }
  }
  return RET_OK;
}
void LiteOpActor::SetInputShape() {
for (size_t i = 0; i < inputs_data_.size(); ++i) {
auto &input_tensor = kernel_->in_tensors()[i];
@ -380,7 +438,7 @@ int LiteOpActor::SetInputData() {
continue;
}
if (src_tensor->data_type() != dst_tensor->data_type()) {
if (NeedCastData(dst_tensor, src_tensor)) {
CastInputData(dst_tensor, src_tensor);
} else if (src_tensor->allocator() == nullptr && !(src_tensor->IsConst()) && !(src_tensor->IsGraphInput())) {
// delegate graph kernel output tensor

View File

@ -99,6 +99,9 @@ class LiteOpActor : public OpActor<lite::Tensor> {
void MoveInputData(Tensor *dst_tensor, Tensor *src_tensor);
void CopyInputData(Tensor *dst_tensor, Tensor *src_tensor);
// Casts src_tensor into dst_tensor: resets the ref count of dst_tensor, dispatches to
// CastTensorListInputData when src_tensor is a TensorList (kObjectTypeTensorType) and
// to CastTensorInputData otherwise, then decrements the ref count of src_tensor.
// Returns the status of the helper that was called.
int CastInputData(Tensor *dst_tensor, Tensor *src_tensor);
// Returns true when both arguments are plain tensors with different data types, or
// both are TensorLists with different element data types; false otherwise.
bool NeedCastData(Tensor *dst_tensor, Tensor *src_tensor);
// Casts the data of one plain tensor into dst_tensor.
// NOTE(review): presumably converts between float16 and float32 and is only
// functional when ENABLE_ARM and ENABLE_FP16 are defined — confirm against the definition.
int CastTensorInputData(Tensor *dst_tensor, Tensor *src_tensor);
// Copies the shape and allocator of the source list to the destination list,
// allocates the destination elements with the opposite float width
// (float16 <-> float32) and casts each element with CastTensorInputData.
int CastTensorListInputData(TensorList *dst_tensor, TensorList *src_tensor);
private:
kernel::LiteKernel *partial_node_ = nullptr;

View File

@ -240,228 +240,4 @@ int CpuSubGraph::Execute(const KernelCallBack &before, const KernelCallBack &aft
}
return RET_OK;
}
#if defined(ENABLE_ARM) && defined(ENABLE_FP16)
// Releases every record kept in origin_input_data_ and then empties the map.
// For each non-null record the stored buffer (when present) is released through the
// record's own allocator_, or with free() when it has none; the record itself is
// released through the subgraph context allocator, or with free() when that is null.
void CpuFp16SubGraph::FreeOriginInputData() {
  for (auto &entry : this->origin_input_data_) {
    auto *record = entry.second;
    if (record == nullptr) {
      continue;
    }

    // Release the buffer owned by the record.
    if (record->data_ != nullptr) {
      if (record->allocator_ != nullptr) {
        record->allocator_->Free(record->data_);
      } else {
        free(record->data_);
      }
    }

    // Release the record itself.
    if (this->Context()->allocator == nullptr) {
      free(record);
    } else {
      this->Context()->allocator->Free(record);
    }
  }
  this->origin_input_data_.clear();
}
// Converts |tensor| from float32 to float16 in place and remembers the original
// float32 buffer in origin_input_data_ so that it can be handed back later.
//
// The tensor is always re-tagged as float16. When it carries no data, nothing else
// happens and RET_OK is returned. Otherwise a new buffer is allocated, filled with
// the converted values, and (when the tensor has an allocator) given the same
// reference count as the float32 buffer.
//
// Returns RET_OK on success and RET_ERROR when the float16 buffer or the DataStore
// record cannot be created. On failure the tensor gets its float32 buffer, own-data
// flag and float32 tag back, so the original data is not orphaned.
int CpuFp16SubGraph::Float32TensorToFloat16Tensor(lite::Tensor *tensor) {
  MS_ASSERT(tensor != nullptr);
  auto float32_data = tensor->data_c();
  auto own_data = tensor->own_data();
  tensor->set_data_type(TypeId::kNumberTypeFloat16);
  if (float32_data == nullptr) {
    // the input data may be nullptr of merge.
    MS_LOG(INFO) << "tensor data is null.";
    return lite::RET_OK;
  }

  // Puts the tensor back into the float32 state it had on entry.
  auto restore_float32 = [tensor, float32_data, own_data]() {
    tensor->set_data(float32_data);
    tensor->set_own_data(own_data);
    tensor->set_data_type(TypeId::kNumberTypeFloat32);
  };

  tensor->set_data(nullptr);
  auto ret = tensor->MallocData();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "malloc data failed";
    restore_float32();
    return RET_ERROR;
  }
  MS_ASSERT(tensor->data_c() != nullptr);
  Float32ToFloat16_fp16_handler(float32_data, tensor->data_c(), tensor->ElementsNum(), support_fp16_);
  if (tensor->allocator() != nullptr) {
    tensor->allocator()->SetRefCount(tensor->data_c(), tensor->allocator()->RefCount(float32_data));
  }
  auto *data_store =
    DataStore::CreateDataStore(float32_data, own_data, tensor->allocator().get(), this->Context()->allocator.get());
  if (data_store == nullptr) {
    MS_LOG(ERROR) << "Create DataStore failed";
    // Nothing recorded the float32 buffer: drop the float16 copy and give it back.
    tensor->FreeData();
    restore_float32();
    return RET_ERROR;
  }
  origin_input_data_[tensor] = data_store;
  return RET_OK;
}
// Converts |tensor| from float16 to float32 in place.
// A float32 buffer is allocated, filled with the converted values and (when the
// tensor has an allocator) given the reference count of the float16 buffer; the
// float16 buffer is then released.
//
// Returns RET_OK on success, RET_NULL_PTR when the tensor has no data, and RET_ERROR
// when the float32 buffer cannot be allocated. In the last case the float16 buffer
// is released as well.
int CpuFp16SubGraph::Float16TensorToFloat32Tensor(lite::Tensor *tensor) {
  MS_ASSERT(tensor != nullptr);
  auto float16_data = tensor->data_c();
  if (float16_data == nullptr) {
    MS_LOG(ERROR) << "tensor data is null.";
    return lite::RET_NULL_PTR;
  }

  // Single release path for the float16 buffer: the tensor's allocator when it has
  // one, free() otherwise. The failure path below uses it too, so the buffer is
  // never handed to a different allocator than on the success path.
  auto release_float16 = [tensor, float16_data]() {
    if (tensor->allocator() != nullptr) {
      tensor->allocator()->Free(float16_data);
    } else {
      free(float16_data);
    }
  };

  tensor->set_data(nullptr);
  tensor->set_data_type(TypeId::kNumberTypeFloat32);
  auto ret = tensor->MallocData();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "malloc data failed";
    release_float16();
    return RET_ERROR;
  }
  MS_ASSERT(tensor->data_c() != nullptr);
  Float16ToFloat32_fp16_handler(float16_data, tensor->data_c(), tensor->ElementsNum(), support_fp16_);
  if (tensor->allocator() != nullptr) {
    tensor->allocator()->SetRefCount(tensor->data_c(), tensor->allocator()->RefCount(float16_data));
  }
  release_float16();
  return RET_OK;
}
// Prepares the subgraph for a float16 run (only when ENABLE_FP16 is defined;
// otherwise it just returns RET_OK).
//  1. For every input tensor that has a root tensor, the input itself is re-tagged
//     from float32 to float16 and the root tensor becomes the one to convert.
//  2. The tensor to convert (or every element of it, for a TensorList with float32
//     elements) goes through Float32TensorToFloat16Tensor. On the first failure all
//     saved input records are released and that status is returned.
//  3. Output tensors of every node except Cast nodes are re-tagged from float32 to
//     float16.
int CpuFp16SubGraph::PreProcess() {
#ifdef ENABLE_FP16
  // Re-tags a float32 tensor, or a TensorList with float32 elements, as float16.
  // Only the type tag changes; no data is touched.
  auto retag_as_fp16 = [](lite::Tensor *target) {
    const auto type = target->data_type();
    if (type == kNumberTypeFloat32) {
      target->set_data_type(kNumberTypeFloat16);
      return;
    }
    if (type != kObjectTypeTensorType) {
      return;
    }
    auto *list = reinterpret_cast<lite::TensorList *>(target);
    if (list->tensors_data_type() == kNumberTypeFloat32) {
      list->set_tensors_data_type(kNumberTypeFloat16);
    }
  };

  for (auto input : this->in_tensors()) {
    MS_ASSERT(input != nullptr);
    auto source = input;
    if (input->root_tensor() != nullptr) {
      source = input->root_tensor();
      retag_as_fp16(input);
    }

    const auto source_type = source->data_type();
    if (source_type == kNumberTypeFloat32) {
      const int status = Float32TensorToFloat16Tensor(source);
      if (status != RET_OK) {
        MS_LOG(ERROR) << "Float32TensorToFloat16Tensor failed.";
        this->FreeOriginInputData();
        return status;
      }
      continue;
    }
    if (source_type != kObjectTypeTensorType) {
      continue;
    }
    auto *list = reinterpret_cast<lite::TensorList *>(source);
    if (list->tensors_data_type() != kNumberTypeFloat32) {
      continue;
    }
    list->set_tensors_data_type(kNumberTypeFloat16);
    for (auto element : list->tensors()) {
      const int status = Float32TensorToFloat16Tensor(element);
      if (status != RET_OK) {
        MS_LOG(ERROR) << "Float32TensorToFloat16Tensor failed.";
        this->FreeOriginInputData();
        return status;
      }
    }
  }

  for (auto node : this->nodes_) {
    for (auto output : node->out_tensors()) {
      // Outputs of Cast nodes keep their declared type.
      if (node->type() != schema::PrimitiveType_Cast) {
        retag_as_fp16(output);
      }
    }
  }
#endif
  return RET_OK;
}
// Undoes the float16 preparation after a run (only when ENABLE_FP16 is defined;
// otherwise it just returns RET_OK).
//  1. Every float16 output tensor, and every element of an output TensorList with
//     float16 elements, is converted with Float16TensorToFloat32Tensor; the first
//     failure is returned.
//  2. Every input tensor that has a root tensor is re-tagged from float16 to float32.
//  3. Every input (or root) tensor with a record in origin_input_data_ drops its
//     current data and gets the saved buffer, own-data flag and float32 tag back.
//     Tensors without a record are left untouched.
//  4. All saved input records are released.
int CpuFp16SubGraph::PostProcess() {
#ifdef ENABLE_FP16
  for (auto tensor : this->out_tensors()) {
    MS_ASSERT(tensor != nullptr);
    if (tensor->data_type() == kNumberTypeFloat16) {
      const int ret = Float16TensorToFloat32Tensor(tensor);
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "Float16TensorToFloat32Tensor failed.";
        return ret;
      }
    } else if (tensor->data_type() == kObjectTypeTensorType) {
      auto tensorlist = reinterpret_cast<lite::TensorList *>(tensor);
      if (tensorlist->tensors_data_type() == kNumberTypeFloat16) {
        tensorlist->set_tensors_data_type(kNumberTypeFloat32);
        for (auto inner_tensor : tensorlist->tensors()) {
          const int ret = Float16TensorToFloat32Tensor(inner_tensor);
          if (ret != RET_OK) {
            MS_LOG(ERROR) << "Float16TensorToFloat32Tensor failed.";
            return ret;
          }
        }
      }
    }
  }

  // Gives |target| the buffer saved for it in origin_input_data_. A tensor without a
  // record (for example one whose data was null when the inputs were converted) is
  // skipped; looking it up with find() instead of at() avoids std::out_of_range.
  auto restore_saved_input = [this](lite::Tensor *target) {
    auto found = origin_input_data_.find(target);
    if (found == origin_input_data_.end()) {
      return;
    }
    auto *saved = found->second;
    target->FreeData();
    MS_ASSERT(saved->data_ != nullptr);
    target->set_data(saved->data_);
    target->set_own_data(saved->own_data_);
    target->set_data_type(kNumberTypeFloat32);
    // The tensor holds the buffer again; FreeOriginInputData must not release it.
    saved->data_ = nullptr;
  };

  for (auto tensor : this->in_tensors()) {
    MS_ASSERT(tensor != nullptr);
    auto real_tensor = tensor;
    if (tensor->root_tensor() != nullptr) {
      real_tensor = tensor->root_tensor();
      if (tensor->data_type() == kNumberTypeFloat16) {
        tensor->set_data_type(kNumberTypeFloat32);
      } else if (tensor->data_type() == kObjectTypeTensorType) {
        auto tensorlist = reinterpret_cast<lite::TensorList *>(tensor);
        if (tensorlist->tensors_data_type() == kNumberTypeFloat16) {
          tensorlist->set_tensors_data_type(kNumberTypeFloat32);
        }
      }
    }
    if (real_tensor->data_type() == kNumberTypeFloat16) {
      restore_saved_input(real_tensor);
    } else if (real_tensor->data_type() == kObjectTypeTensorType) {
      auto tensorlist = reinterpret_cast<lite::TensorList *>(real_tensor);
      if (tensorlist->tensors_data_type() == kNumberTypeFloat16) {
        tensorlist->set_tensors_data_type(kNumberTypeFloat32);
        for (auto inner_tensor : tensorlist->tensors()) {
          MS_ASSERT(inner_tensor != nullptr);
          restore_saved_input(inner_tensor);
        }
      }
    }
  }
  this->FreeOriginInputData();
#endif
  return RET_OK;
}
#endif
} // namespace mindspore::kernel

View File

@ -175,67 +175,7 @@ class CpuFp16SubGraph : public CpuSubGraph {
return CpuSubGraph::Init();
}
int PreProcess();
int Execute() override {
auto ret = PreProcess();
if (lite::RET_OK != ret) {
MS_LOG(ERROR) << "run kernel PreProcess failed, name: " << this->name();
return ret;
}
ret = CpuSubGraph::Execute();
if (lite::RET_OK != ret) {
MS_LOG(ERROR) << "run kernel failed, name: " << this->name();
return ret;
}
ret = PostProcess();
if (lite::RET_OK != ret) {
MS_LOG(ERROR) << "run kernel PreProcess failed, name: " << this->name();
return ret;
}
return lite::RET_OK;
}
int Execute(const KernelCallBack &before, const KernelCallBack &after) override {
auto ret = PreProcess();
if (lite::RET_OK != ret) {
MS_LOG(ERROR) << "run kernel PreProcess failed, name: " << this->name();
return ret;
}
#ifdef Debug
for (const auto *node : nodes_) {
if (node->type() == schema::PrimitiveType_PartialFusion) {
continue;
}
for (const auto *in_tensor : node->in_tensors()) {
if (in_tensor->data_type() == kNumberTypeFloat32) {
MS_LOG(ERROR) << "FP16 kernel can not accept float32 input";
return lite::RET_ERROR;
}
}
}
#endif
ret = CpuSubGraph::Execute(before, after);
if (lite::RET_OK != ret) {
MS_LOG(ERROR) << "run kernel failed, name: " << this->name();
return ret;
}
ret = PostProcess();
if (lite::RET_OK != ret) {
MS_LOG(ERROR) << "run kernel PreProcess failed, name: " << this->name();
return ret;
}
return lite::RET_OK;
};
int PostProcess();
private:
void FreeOriginInputData();
int Float32TensorToFloat16Tensor(lite::Tensor *tensor);
int Float16TensorToFloat32Tensor(lite::Tensor *tensor);
private:
std::map<lite::Tensor *, DataStore *> origin_input_data_;
bool support_fp16_ = false;
};
#endif

View File

@ -81,7 +81,7 @@ ml_video_edit_oneclick_adaptis.pb;3 6
ml_female_model_step6_noiseout.pb;66 2
ml_male_model_step6_noiseout.pb;66 2.5
ml_tts_encoder_control_flow.pb;4;1:1,22:1:1 1.5
#ml_tts_decoder_control_flow.pb;5 1 to open
ml_tts_decoder_control_flow.pb;5 1
#ml_tts_decoder.pb;5 2.5 to open
ml_tts_vocoder.pb;66 53
hiai_transformer_encoder.pb;15 4