train code: check return values

yefeng 2022-02-16 19:01:08 +08:00
parent fa3e26ce61
commit 67b4fe9cea
5 changed files with 50 additions and 14 deletions
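Every hunk below applies the same idiom: capture the int status code a callee returns, log on failure with MS_LOG(ERROR), and propagate an error instead of silently dropping it. A minimal, compilable sketch of that idiom follows; RET_OK, RET_ERROR, and the logging macro here are simplified stand-ins, not the project's real definitions.

#include <iostream>

// Hypothetical stand-ins for mindspore-lite's status codes and logger.
constexpr int RET_OK = 0;
constexpr int RET_ERROR = -1;
#define MS_LOG_ERROR() (std::cerr << "[ERROR] ")

// A callee that reports success or failure through an int status code.
int OptimizerStep() { return RET_OK; }

// Before this commit the status was silently discarded; after it, every
// call site captures the status, logs, and returns an error up the stack.
int Eval() {
  auto ret = OptimizerStep();
  if (ret != RET_OK) {
    MS_LOG_ERROR() << "OptimizerStep failed." << std::endl;
    return RET_ERROR;
  }
  return RET_OK;
}

int main() { return Eval() == RET_OK ? 0 : 1; }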

View File

@@ -35,7 +35,11 @@ void AccuracyMonitor::Begin(const session::TrainLoopCallBackData &cb_data) {
 int AccuracyMonitor::EpochEnd(const session::TrainLoopCallBackData &cb_data) {
   if ((static_cast<int>(cb_data.epoch_) + 1) % check_every_n_ == 0) {
-    cb_data.loop_->Eval(ds_, {}, nullptr, max_steps_);
+    auto ret = cb_data.loop_->Eval(ds_, {}, nullptr, max_steps_);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Eval failed.";
+      return RET_ERROR;
+    }
   }
   accuracies_.push_back(std::make_pair(cb_data.epoch_, 0.0));
   return mindspore::session::RET_CONTINUE;

View File

@@ -117,8 +117,8 @@ class OptimizerKernel : public InnerKernel {
   }
   int RestoreDefaultLearningRate() {
-    SetLearningRate(default_lr_);
-    return RET_OK;
+    auto ret = SetLearningRate(default_lr_);
+    return ret;
   }
   int SetOptimizerMode(WeightUpdateMode mod) {
@@ -139,7 +139,11 @@ class OptimizerKernel : public InnerKernel {
       weight_update_mod_ = mod;
     } else {
       if (grad_sum_ != nullptr) {
-        OptimizerStep();
+        auto ret = OptimizerStep();
+        if (ret != RET_OK) {
+          MS_LOG(ERROR) << "OptimizerStep failed.";
+          return RET_ERROR;
+        }
         ms_context_->allocator->Free(grad_sum_);
         grad_sum_ = nullptr;
       }
@@ -169,7 +173,11 @@ class OptimizerKernel : public InnerKernel {
   int Eval() override {
     if (weight_update_mod_ != WeightUpdateMode::ACCUMULATE_GRADS) {
-      OptimizerStep();
+      auto ret = OptimizerStep();
+      if (ret != RET_OK) {
+        MS_LOG(ERROR) << "OptimizerStep failed.";
+        return RET_ERROR;
+      }
     }
     return InnerKernel::Eval();
   }

View File

@@ -167,7 +167,11 @@ std::unique_ptr<schema::TensorT> TrainExport::CreateTensor(const mindspore::lite
   tensorT->enableHuffmanCode = false;
   if ((tensorT->nodeType == NodeType_ValueNode) && (scTensor->data() != nullptr) && (scTensor->data()->size() > 0)) {
     if (NeedQuantization(tensor)) {
-      QuantTensorData(tensorT.get(), tensor, preferred_dim);
+      auto ret = QuantTensorData(tensorT.get(), tensor, preferred_dim);
+      if (ret != RET_OK) {
+        MS_LOG(ERROR) << "QuantTensorData failed.";
+        return nullptr;
+      }
     } else {
       tensorT->data = CreateData(tensor);
     }

View File

@@ -498,8 +498,12 @@ int TrainSession::MixPrecisionExecKernels(const KernelCallBack &before, const Ke
   float scale = cfg_.mix_precision_cfg_.loss_scale_;
   for (auto *kernel : run_kernels) {
     MS_ASSERT(kernel != nullptr);
-    MixPrecisionPreProcess(kernel, scale);
-    auto ret = kernel->Execute(before, after);
+    auto ret = MixPrecisionPreProcess(kernel, scale);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "MixPrecisionPreProcess failed.";
+      return RET_ERROR;
+    }
+    ret = kernel->Execute(before, after);
     if (RET_OK != ret) {
       MixPrecisionPostProcess(kernel);
       // decrease loss scale in case of nan or inf
@@ -758,7 +762,11 @@ void TrainSession::CompileOptimizedKernels() {
       std::copy(kernel->in_tensors().begin(), kernel->in_tensors().end(), std::back_inserter(out_tensor));
       if (cfg_.accumulate_gradients_) {
         auto optimizer = static_cast<kernel::OptimizerKernel *>(kernel->kernel());
-        optimizer->SetOptimizerMode(kernel::WeightUpdateMode::ACCUMULATE_GRADS);
+        auto ret = optimizer->SetOptimizerMode(kernel::WeightUpdateMode::ACCUMULATE_GRADS);
+        if (ret != RET_OK) {
+          MS_LOG(ERROR) << "SetOptimizerMode failed.";
+          return;
+        }
       }
     }
   }
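This hunk is the one place where the new check cannot propagate a status: CompileOptimizedKernels() returns void, so a failed SetOptimizerMode can only be logged and cut short with a bare return. A tiny sketch of that shape, using hypothetical stand-in names rather than the real kernel API:

#include <iostream>

constexpr int RET_OK = 0;
int SetOptimizerMode() { return RET_OK; }  // hypothetical stand-in

void CompileOptimizedKernels() {
  if (SetOptimizerMode() != RET_OK) {
    std::cerr << "SetOptimizerMode failed." << std::endl;
    return;  // void function: nothing to propagate, just stop early
  }
  // ... continue compiling optimized kernels ...
}

int main() { CompileOptimizedKernels(); }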
@@ -832,7 +840,7 @@ int TrainSession::SetOptimizerParams(const std::vector<tensor::MSTensor *> &para
   for (size_t ix = 0; ix < params.size(); ix++) {
     auto param = params[ix];
     if (param == nullptr) {
-      MS_LOG(ERROR) << "Param tensor " << param->tensor_name() << " is null.";
+      MS_LOG(ERROR) << "Param tensor is null.";
       return RET_ERROR;
     }
     bool found = false;
@@ -876,7 +884,7 @@ int TrainSession::ApplyGradients(const std::vector<tensor::MSTensor *> &gradient
   for (size_t ix = 0; ix < gradients.size(); ix++) {
     auto gradient = gradients[ix];
     if (gradient == nullptr) {
-      MS_LOG(ERROR) << "gradient tensor " << gradient->tensor_name() << " is null.";
+      MS_LOG(ERROR) << "gradient tensor is null.";
       return RET_ERROR;
     }
     bool found = false;
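The two message rewrites above fix more than wording: the old log lines dereferenced the very pointer that had just been found to be null (param->tensor_name() and gradient->tensor_name()), which is undefined behavior on exactly the path meant to report the problem. A small sketch of the bug and the fix, with a simplified stand-in type instead of tensor::MSTensor:

#include <iostream>
#include <string>

struct Tensor {  // hypothetical stand-in for tensor::MSTensor
  std::string tensor_name() const { return "w0"; }
};

int CheckParam(const Tensor *param) {
  if (param == nullptr) {
    // Before: << param->tensor_name() here dereferenced a null pointer.
    // After: the message no longer touches the pointer at all.
    std::cerr << "Param tensor is null." << std::endl;
    return -1;  // RET_ERROR
  }
  std::cout << "checking " << param->tensor_name() << std::endl;
  return 0;  // RET_OK
}

int main() { return CheckParam(nullptr) == 0 ? 0 : 1; }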
@@ -904,7 +912,10 @@ int TrainSession::ApplyGradients(const std::vector<tensor::MSTensor *> &gradient
   for (auto kernel : this->train_kernels_) {
     if (IsOptimizer(kernel)) {
       auto optimizer = static_cast<kernel::OptimizerKernel *>(kernel->kernel());
-      optimizer->set_grad_sum_valid();
+      if (optimizer->set_grad_sum_valid() != RET_OK) {
+        MS_LOG(ERROR) << "set grad sum valid failed.";
+        return RET_ERROR;
+      }
       auto ret = optimizer->OptimizerStep();
       if (ret != RET_OK) {
         MS_LOG(ERROR) << "failed to optimize model weights";
View File

@@ -195,7 +195,10 @@ int TransferSession::Export(const std::string &filename, ModelType model_type, Q
   }
   bool orig_train_state = IsTrain();
-  Eval();
+  if (Eval() != RET_OK) {
+    MS_LOG(ERROR) << "eval failed.";
+    return RET_ERROR;
+  }
   TrainExport texport(filename);
   int status = texport.LoadModel(lite_model_, size_backbone_);
   if (status != RET_OK) {
@@ -231,7 +234,13 @@ int TransferSession::Export(const std::string &filename, ModelType model_type, Q
     MS_LOG(ERROR) << "failed to save to " << filename;
     return status;
   }
-  if (orig_train_state) Train();
+  if (orig_train_state) {
+    auto ret = Train();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "train failed.";
+      return RET_ERROR;
+    }
+  }
   return status;
 }