forked from mindspore-Ecosystem/mindspore
!24922 MD Profiling: RecordEndOfEpoch in ProfilingManager
Merge pull request !24922 from cathwong/ckw_prof_epochend2
This commit is contained in:
commit
70cd1c77d5
|
@ -129,10 +129,11 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) {
|
|||
// An eoe row means we have iterated an epoch.
|
||||
// The next row in the pipeline might be an EOF or a TensorRow for next epoch
|
||||
if (out_row->eoe()) {
|
||||
MS_LOG(INFO) << "End of data iteration.";
|
||||
MS_LOG(INFO) << "End of data iteration. cur_batch_num_: " << cur_batch_num_;
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (is_profiling_enable) {
|
||||
root_->Tree()->SetEpochEnd();
|
||||
root_->Tree()->GetProfilingManager()->RecordEndOfEpoch(cur_batch_num_);
|
||||
}
|
||||
#endif
|
||||
return Status::OK();
|
||||
|
|
|
@ -280,6 +280,7 @@ Status DeviceQueueOp::SendDataToAscend() {
|
|||
connector_size = ChildOpConnectorSize();
|
||||
connector_capacity = ChildOpConnectorCapacity();
|
||||
tree_->SetEpochEnd();
|
||||
tree_->GetProfilingManager()->RecordEndOfEpoch(send_batch);
|
||||
}
|
||||
#endif
|
||||
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&curr_row));
|
||||
|
@ -574,8 +575,10 @@ Status DeviceQueueOp::SendDataToGPU() {
|
|||
first_fetch_flag_ = true;
|
||||
int64_t num_buf = 0;
|
||||
bool is_break_loop = false;
|
||||
uint32_t batch_num = 0;
|
||||
while (!current_row.eof() && !is_break_loop && !GpuBufferMgr::GetInstance().IsClosed()) {
|
||||
while (!current_row.eoe() && !is_break_loop && !GpuBufferMgr::GetInstance().IsClosed()) {
|
||||
batch_num++;
|
||||
RETURN_IF_NOT_OK(FilterMetadata(&current_row));
|
||||
RETURN_IF_NOT_OK(CheckExceptions(current_row));
|
||||
#ifndef ENABLE_SECURITY
|
||||
|
@ -594,6 +597,12 @@ Status DeviceQueueOp::SendDataToGPU() {
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (current_row.eoe() && tree_->GetProfilingManager()->IsProfilingEnable()) {
|
||||
tree_->SetEpochEnd();
|
||||
tree_->GetProfilingManager()->RecordEndOfEpoch(batch_num);
|
||||
}
|
||||
#endif
|
||||
if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
|
||||
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&current_row));
|
||||
} else {
|
||||
|
|
|
@ -240,6 +240,12 @@ Status ProfilingManager::ChangeFileMode() {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// Record the end of an epoch: log the step number, then append the current
// timestamp (from ProfilingTime::GetCurMilliSecond) and the step number to the
// end-of-epoch records kept by this manager.
// @param step_num - The number of steps
void ProfilingManager::RecordEndOfEpoch(uint32_t step_num) {
  MS_LOG(INFO) << "Record end of epoch. step_num: " << step_num;
  // Take the timestamp once, after logging, and store it alongside the step number.
  const uint64_t end_ts = ProfilingTime::GetCurMilliSecond();
  epoch_end_ts_.push_back(end_ts);
  epoch_end_step_.push_back(step_num);
}
|
||||
|
||||
uint64_t ProfilingTime::GetCurMilliSecond() {
|
||||
// because cpplint does not allow using namespace
|
||||
using std::chrono::duration_cast;
|
||||
|
|
|
@ -121,6 +121,10 @@ class ProfilingManager {
|
|||
// launched. This is the master off switch, once called, it won't start profiler even if env variable says so.
|
||||
void DisableProfiling() { enabled_ = false; }
|
||||
|
||||
// Record end of epoch information: append the current timestamp and step_num to the end-of-epoch records
|
||||
// @param step_num - The number of steps (callers pass their running batch count at the end of the epoch)
|
||||
void RecordEndOfEpoch(uint32_t step_num);
|
||||
|
||||
const std::unordered_map<std::string, std::shared_ptr<Sampling>> &GetSamplingNodes() { return sampling_nodes_; }
|
||||
|
||||
// Launch monitoring thread.
|
||||
|
@ -138,6 +142,9 @@ class ProfilingManager {
|
|||
|
||||
std::unordered_map<std::string, std::shared_ptr<Sampling>> sampling_nodes_;
|
||||
|
||||
std::vector<uint64_t> epoch_end_ts_; // End of epoch timestamp
|
||||
std::vector<uint32_t> epoch_end_step_; // End of epoch step number
|
||||
|
||||
// Register profile node to tree
|
||||
// @param node - Profiling node
|
||||
// @return Status The status code returned
|
||||
|
|
|
@ -235,10 +235,11 @@ Status TreeAdapter::GetNext(TensorRow *row) {
|
|||
|
||||
RETURN_IF_NOT_OK(tree_->root()->GetNextRow(row)); // first buf can't be eof or empty buf with none flag
|
||||
if (row->eoe()) { // return empty tensor if 1st buf is a ctrl buf (no rows)
|
||||
MS_LOG(INFO) << "End of data iteration.";
|
||||
MS_LOG(INFO) << "End of data iteration. cur_batch_num_: " << cur_batch_num_;
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (is_profiling_enable) {
|
||||
tree_->SetEpochEnd();
|
||||
tree_->GetProfilingManager()->RecordEndOfEpoch(cur_batch_num_);
|
||||
}
|
||||
#endif
|
||||
return Status::OK();
|
||||
|
|
Loading…
Reference in New Issue