!24922 MD Profiling: RecordEndOfEpoch in ProfilingManager

Merge pull request !24922 from cathwong/ckw_prof_epochend2
This commit is contained in:
i-robot 2021-10-15 18:37:28 +00:00 committed by Gitee
commit 70cd1c77d5
5 changed files with 26 additions and 2 deletions

View File

@ -129,10 +129,11 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) {
// An eoe row means we have iterated an epoch.
// The next row in the pipeline might be an EOF or a TensorRow for next epoch
if (out_row->eoe()) {
MS_LOG(INFO) << "End of data iteration.";
MS_LOG(INFO) << "End of data iteration. cur_batch_num_: " << cur_batch_num_;
#ifndef ENABLE_SECURITY
if (is_profiling_enable) {
root_->Tree()->SetEpochEnd();
root_->Tree()->GetProfilingManager()->RecordEndOfEpoch(cur_batch_num_);
}
#endif
return Status::OK();

View File

@ -280,6 +280,7 @@ Status DeviceQueueOp::SendDataToAscend() {
connector_size = ChildOpConnectorSize();
connector_capacity = ChildOpConnectorCapacity();
tree_->SetEpochEnd();
tree_->GetProfilingManager()->RecordEndOfEpoch(send_batch);
}
#endif
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&curr_row));
@ -574,8 +575,10 @@ Status DeviceQueueOp::SendDataToGPU() {
first_fetch_flag_ = true;
int64_t num_buf = 0;
bool is_break_loop = false;
uint32_t batch_num = 0;
while (!current_row.eof() && !is_break_loop && !GpuBufferMgr::GetInstance().IsClosed()) {
while (!current_row.eoe() && !is_break_loop && !GpuBufferMgr::GetInstance().IsClosed()) {
batch_num++;
RETURN_IF_NOT_OK(FilterMetadata(&current_row));
RETURN_IF_NOT_OK(CheckExceptions(current_row));
#ifndef ENABLE_SECURITY
@ -594,6 +597,12 @@ Status DeviceQueueOp::SendDataToGPU() {
}
}
#ifndef ENABLE_SECURITY
if (current_row.eoe() && tree_->GetProfilingManager()->IsProfilingEnable()) {
tree_->SetEpochEnd();
tree_->GetProfilingManager()->RecordEndOfEpoch(batch_num);
}
#endif
if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&current_row));
} else {

View File

@ -240,6 +240,12 @@ Status ProfilingManager::ChangeFileMode() {
return Status::OK();
}
void ProfilingManager::RecordEndOfEpoch(uint32_t step_num) {
MS_LOG(INFO) << "Record end of epoch. step_num: " << step_num;
(void)epoch_end_ts_.emplace_back(ProfilingTime::GetCurMilliSecond());
(void)epoch_end_step_.emplace_back(step_num);
}
uint64_t ProfilingTime::GetCurMilliSecond() {
// because cpplint does not allow using namespace
using std::chrono::duration_cast;

View File

@ -121,6 +121,10 @@ class ProfilingManager {
// launched. This is the master off switch, once called, it won't start profiler even if env variable says so.
void DisableProfiling() { enabled_ = false; }
// Record end of epoch information: appends a timestamp taken from ProfilingTime::GetCurMilliSecond()
// to epoch_end_ts_ and the given step number to epoch_end_step_.
// @param step_num - The end-of-epoch step number supplied by the caller
void RecordEndOfEpoch(uint32_t step_num);
const std::unordered_map<std::string, std::shared_ptr<Sampling>> &GetSamplingNodes() { return sampling_nodes_; }
// Launch monitoring thread.
@ -138,6 +142,9 @@ class ProfilingManager {
std::unordered_map<std::string, std::shared_ptr<Sampling>> sampling_nodes_;
std::vector<uint64_t> epoch_end_ts_; // End of epoch timestamp
std::vector<uint32_t> epoch_end_step_; // End of epoch step number
// Register profile node to tree
// @param node - Profiling node
// @return Status The status code returned

View File

@ -235,10 +235,11 @@ Status TreeAdapter::GetNext(TensorRow *row) {
RETURN_IF_NOT_OK(tree_->root()->GetNextRow(row)); // first buf can't be eof or empty buf with none flag
if (row->eoe()) { // return empty tensor if 1st buf is a ctrl buf (no rows)
MS_LOG(INFO) << "End of data iteration.";
MS_LOG(INFO) << "End of data iteration. cur_batch_num_: " << cur_batch_num_;
#ifndef ENABLE_SECURITY
if (is_profiling_enable) {
tree_->SetEpochEnd();
tree_->GetProfilingManager()->RecordEndOfEpoch(cur_batch_num_);
}
#endif
return Status::OK();