forked from mindspore-Ecosystem/mindspore
!6568 fix the bug for sending suspend command
Merge pull request !6568 from yelihua/temp-dev
This commit is contained in:
commit
5ae77f2d51
|
@ -371,7 +371,14 @@ void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_gra
|
|||
}
|
||||
|
||||
void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
||||
// check the dump_enabled and dataset_sink_mode
|
||||
bool dump_enabled = DumpDataEnabledIteration();
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
if (dump_enabled && context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
|
||||
MS_EXCEPTION(NotSupportError) << "Don't support set dataset_sink_mode to True when using e2e_dump";
|
||||
}
|
||||
|
||||
if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -283,12 +283,15 @@ void Debugger::PostExecuteNode() {
|
|||
auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table);
|
||||
|
||||
// if kernel is watchpoint,and get hit. suspend.
|
||||
bool hit_empty_flag = true;
|
||||
if (is_watchpoint) {
|
||||
auto hits = CheckWatchpoints(cur_name_);
|
||||
if (!hits.empty()) {
|
||||
SendWatchpointsAndSuspend(hits);
|
||||
hit_empty_flag = false;
|
||||
}
|
||||
} else if (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) {
|
||||
}
|
||||
if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) {
|
||||
// if kernel is not watchpoint and is next_to or continue_to node, suspend
|
||||
CommandLoop();
|
||||
}
|
||||
|
@ -405,7 +408,9 @@ void Debugger::CommandLoop() {
|
|||
MS_LOG(ERROR) << "Error: WaitForCommand failed";
|
||||
num_wait_fail++;
|
||||
if (num_wait_fail > max_num_wait_fail) {
|
||||
MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session";
|
||||
MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session.";
|
||||
MS_LOG(ERROR) << "Failed to connect to MindInsight debugger server. Please check the config "
|
||||
"of debugger host and port.";
|
||||
Exit();
|
||||
}
|
||||
MS_LOG(ERROR) << "Number of consecutive WaitForCommand fail:" << num_wait_fail << "; Retry after "
|
||||
|
@ -417,11 +422,11 @@ void Debugger::CommandLoop() {
|
|||
// get type of the command in reply
|
||||
DebuggerCommand cmd = GetCommand(reply);
|
||||
if (cmd == DebuggerCommand::kUnknownCMD) {
|
||||
MS_LOG(DEBUG) << "Debug: debugger recieved unknown command";
|
||||
MS_LOG(DEBUG) << "Debug: debugger received unknown command";
|
||||
continue;
|
||||
}
|
||||
|
||||
MS_LOG(INFO) << "recieved command: ";
|
||||
MS_LOG(INFO) << "received command: ";
|
||||
switch (cmd) {
|
||||
case DebuggerCommand::kUnknownCMD:
|
||||
MS_LOG(INFO) << "UnknownCMD";
|
||||
|
|
|
@ -13,11 +13,12 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "profiler/device/gpu/data_saver.h"
|
||||
#include <fstream>
|
||||
#include <numeric>
|
||||
#include "sys/stat.h"
|
||||
#include "utils/log_adapter.h"
|
||||
#include "utils/ms_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace profiler {
|
||||
|
@ -183,6 +184,7 @@ void DataSaver::WriteOpType(const std::string &saver_base_dir) {
|
|||
ofs << op_type_info.second << std::endl;
|
||||
}
|
||||
ofs.close();
|
||||
ChangeFileMode(file_path);
|
||||
MS_LOG(INFO) << "Write " << op_type_infos_.size() << " op type infos into file: " << file_path;
|
||||
}
|
||||
|
||||
|
@ -199,6 +201,7 @@ void DataSaver::WriteOpDetail(const std::string &saver_base_dir) {
|
|||
ofs << op_detail << std::endl;
|
||||
}
|
||||
ofs.close();
|
||||
ChangeFileMode(file_path);
|
||||
MS_LOG(INFO) << "Write " << op_detail_infos_.size() << " op detail infos into file: " << file_path;
|
||||
}
|
||||
|
||||
|
@ -232,7 +235,9 @@ void DataSaver::WriteActivity(const std::string &saver_base_dir) {
|
|||
}
|
||||
}
|
||||
ofs.close();
|
||||
ChangeFileMode(file_path);
|
||||
activity_timestamp_ofs.close();
|
||||
ChangeFileMode(timestamp_file_path);
|
||||
MS_LOG(INFO) << "Write " << device_info.second.size() << " activity infos into file: " << file_path;
|
||||
}
|
||||
}
|
||||
|
@ -254,6 +259,14 @@ void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) {
|
|||
ofs << std::endl;
|
||||
}
|
||||
ofs.close();
|
||||
ChangeFileMode(file_path);
|
||||
}
|
||||
|
||||
// Restricts the permissions of the file at `file_path` to S_IRUSR | S_IWUSR
// (read and write for the owning user only) by calling chmod().
// A chmod() failure is only logged at INFO level; nothing is returned or
// thrown, so the caller cannot tell whether the mode change succeeded.
void DataSaver::ChangeFileMode(const std::string &file_path) {
|
||||
// chmod() reports failure by returning -1.
if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) {
|
||||
// Best-effort: record the failure in the log and carry on.
MS_LOG(INFO) << "Modify file:" << file_path << " to rw fail.";
|
||||
return;
|
||||
}
|
||||
}
|
||||
} // namespace gpu
|
||||
} // namespace profiler
|
||||
|
|
|
@ -145,6 +145,8 @@ class DataSaver {
|
|||
|
||||
void WriteOpTimestamp(const std::string &saver_base_dir);
|
||||
|
||||
void ChangeFileMode(const std::string &file_path);
|
||||
|
||||
std::string device_id_;
|
||||
AllActivityInfos activity_infos_;
|
||||
OpTypeInfos op_type_infos_;
|
||||
|
|
Loading…
Reference in New Issue