Fix segfault on YOLOv3-tiny multi-root graph

This commit is contained in:
John Tzanakakis 2021-08-30 16:28:47 -04:00
parent 65dabb58ef
commit 1f4e1d2be7
4 changed files with 52 additions and 23 deletions

View File

@ -84,45 +84,57 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
void *const previous_tensor_ptr, uint32_t num_elements,
int tensor_dtype) {
uint32_t prev_num_elements, int tensor_dtype) {
switch (tensor_dtype) {
case DbgDataType::DT_UINT8: {
return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_INT8: {
return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_UINT16: {
return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_INT16: {
return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_UINT32: {
return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_INT32:
case DbgDataType::DT_BASE_INT: {
return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_UINT64: {
return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_INT64: {
return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_FLOAT16: {
return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_FLOAT32:
case DbgDataType::DT_BASE_FLOAT: {
return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_FLOAT64: {
return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
case DbgDataType::DT_BOOL: {
return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
prev_num_elements);
}
default:
MS_LOG(INFO) << "Unsupported tensor type";
@ -132,7 +144,8 @@ std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData>
}
#ifdef OFFLINE_DBG_MODE
void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
uint32_t *prev_num_elements) {
void *previous_tensor_ptr = nullptr;
std::shared_ptr<TensorData> tensor_prev;
if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
@ -155,6 +168,7 @@ void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bo
tensor_prev.reset();
} else {
previous_tensor_ptr = tensor_prev->GetDataPtr();
*prev_num_elements = tensor_prev->GetNumElements();
}
}
return previous_tensor_ptr;
@ -247,16 +261,21 @@ void DebugServices::CheckWatchpointsForTensor(
// no wp set on current tensor
if (watchpoints_to_check.empty()) continue;
uint32_t num_elements = tensor->GetNumElements();
uint32_t prev_num_elements = 0;
void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
#else
void *previous_tensor_ptr =
tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr;
std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
if (prev_tensor_data) {
previous_tensor_ptr = prev_tensor_data->GetDataPtr();
prev_num_elements = prev_tensor_data->GetNumElements();
}
#endif
std::unique_ptr<ITensorSummary> base_summary_ptr;
if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype);
base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
if (base_summary_ptr != nullptr) {
base_summary_ptr->SummarizeTensor(watchpoints_to_check);
}

View File

@ -238,7 +238,8 @@ class DebugServices {
std::vector<std::shared_ptr<TensorData>> ReadNeededDumpedTensors(unsigned int iteration,
std::vector<std::string> *async_file_pool);
void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed);
void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
uint32_t *prev_num_elements);
void ReadTensorFromNpy(const std::string &file_name, std::string *tensor_type, std::size_t *size,
std::vector<int64_t> *shape, std::vector<char> **data_buffer);

View File

@ -93,10 +93,12 @@ double VarianceAndMeanCalculator::GetVariance() const {
double VarianceAndMeanCalculator::GetStandardDeviation() { return sqrt(GetVariance()); }
template <typename T>
TensorSummary<T>::TensorSummary(void *current_tensor_ptr, void *const previous_tensor_ptr, uint32_t num_elements)
TensorSummary<T>::TensorSummary(void *current_tensor_ptr, void *const previous_tensor_ptr, uint32_t num_elements,
uint32_t prev_num_elements)
: current_tensor_ptr(reinterpret_cast<T *>(current_tensor_ptr)),
prev_tensor_ptr(reinterpret_cast<T *>(previous_tensor_ptr)),
num_elements(num_elements),
prev_num_elements_(prev_num_elements),
min(std::numeric_limits<double>::max()),
max(std::numeric_limits<double>::lowest()),
inf_count(0),
@ -110,8 +112,14 @@ void TensorSummary<T>::SummarizeTensor(const std::vector<DebugServices::watchpoi
InitCalculators(wps);
for (size_t i = 0; i < num_elements; ++i) {
auto current_value = static_cast<double>(current_tensor_ptr[i]);
double previous_value =
prev_tensor_ptr ? static_cast<double>(prev_tensor_ptr[i]) : std::numeric_limits<double>::quiet_NaN();
double previous_value = std::numeric_limits<double>::quiet_NaN();
if (prev_tensor_ptr) {
if (num_elements == prev_num_elements_) {
previous_value = static_cast<double>(prev_tensor_ptr[i]);
} else {
MS_LOG(DEBUG) << "Current and previous tensor are not the same size.";
}
}
inf_count += std::isinf(current_value);
nan_count += std::isnan(current_value);
zero_count += (current_value == 0);

View File

@ -99,7 +99,7 @@ class TensorSummary : public ITensorSummary {
public:
TensorSummary() = default;
~TensorSummary() override = default;
TensorSummary(void *, void *, uint32_t);
TensorSummary(void *, void *, uint32_t, uint32_t);
void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) override;
// returns hit, error_code, parameter_list
std::tuple<bool, int, std::vector<DebugServices::parameter_t>> IsWatchpointHit(DebugServices::watchpoint_t) override;
@ -108,6 +108,7 @@ class TensorSummary : public ITensorSummary {
T *current_tensor_ptr;
T *prev_tensor_ptr;
uint32_t num_elements;
uint32_t prev_num_elements_;
double min;
double max;
uint32_t inf_count;