!16564 adjust default monitoring sampling interval and change output save pattern

From: @ziruiwu
Reviewed-by: 
Signed-off-by:
This commit is contained in:
mindspore-ci-bot 2021-05-27 02:02:45 +08:00 committed by Gitee
commit 878fbdaf61
2 changed files with 11 additions and 2 deletions

View File

@ -35,14 +35,23 @@ Status Monitor::operator()() {
// Keep sampling if
// 1) Monitor Task is not interrupted by TaskManager AND
// 2) Iterator has not received EOF
// In addition to the save at the end of every epoch, this triggers a save at roughly the 2 min, 4 min, 8 min,
// 16 min, ... marks (the save interval doubles after each such save).
// The idea is that whenever training is interrupted, at least half of the data sampled so far has been saved.
int64_t save_interval = 1 + (120 * 1000 / sampling_interval_);
int64_t loop_cnt = 1;
while (!this_thread::is_interrupted() && !(tree_->isFinished()) && !(cfg->stop_profiler_status())) {
if (tree_->IsEpochEnd()) {
RETURN_IF_NOT_OK(tree_->GetProfilingManager()->SaveProfilingData());
tree_->SetExecuting();
} else if (loop_cnt % save_interval == 0) {
RETURN_IF_NOT_OK(tree_->GetProfilingManager()->SaveProfilingData());
}
for (auto &node : tree_->GetProfilingManager()->GetSamplingNodes()) {
RETURN_IF_NOT_OK(node.second->Sample());
}
if (loop_cnt % save_interval == 0) save_interval *= 2;
loop_cnt += 1;
std::this_thread::sleep_for(std::chrono::milliseconds(sampling_interval_));
}

View File

@ -102,8 +102,8 @@ constexpr uint32_t kCfgWorkerConnectorSize = 16;
constexpr uint32_t kCfgOpConnectorSize = 16;
constexpr int32_t kCfgDefaultRankId = -1;
constexpr uint32_t kCfgDefaultSeed = std::mt19937::default_seed;
// NOTE(review): the next two constants appear twice in this view, presumably the removed and the added lines of
// the diff hunk; confirm that only the later pair (1000 ms sampling interval) remains in the actual header.
constexpr uint32_t kCfgMonitorSamplingInterval = 10;
constexpr uint32_t kCfgCallbackTimeout = 60; // timeout value for callback in seconds
constexpr uint32_t kCfgMonitorSamplingInterval = 1000; // default monitor sampling interval in milliseconds
constexpr uint32_t kCfgCallbackTimeout = 60; // timeout value for callback in seconds
constexpr int32_t kCfgDefaultCachePort = 50052;
constexpr char kCfgDefaultCacheHost[] = "127.0.0.1";
constexpr int32_t kDftPrefetchSize = 20;