Add statistic dump for Ascend
This commit is contained in:
parent
811d7128c7
commit
b9d1a4920c
|
@ -382,9 +382,9 @@ void DumpJsonParser::ParseSavedData(const nlohmann::json &content) {
|
|||
<< saved_data_ << ". Please set saved_data to either statistic, tensor, or full";
|
||||
}
|
||||
auto context = MsContext::GetInstance();
|
||||
if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kGPUDevice) {
|
||||
MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU device, please set "
|
||||
"saved_data to tensor or use a GPU device";
|
||||
if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
|
||||
MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU and Ascend, please "
|
||||
"set saved_data to tensor or use a GPU or Ascend device";
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <unistd.h>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
@ -511,7 +512,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
|
|||
for (uint32_t slot = 0; slot < input_tensors.size(); slot++) {
|
||||
auto in_tensor = input_tensors[slot];
|
||||
std::string in_slot_path = in_path + std::to_string(slot) + ".";
|
||||
auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset);
|
||||
auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset, "input", slot);
|
||||
if (!succ) {
|
||||
MS_LOG(INFO) << "Failed to convert format for tensor " << in_slot_path;
|
||||
}
|
||||
|
@ -524,7 +525,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
|
|||
for (uint32_t slot = 0; slot < output_tensors.size(); slot++) {
|
||||
auto out_tensor = output_tensors[slot];
|
||||
std::string out_slot_path = out_path + std::to_string(slot) + ".";
|
||||
auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset);
|
||||
auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset, "output", slot);
|
||||
if (!succ) {
|
||||
MS_LOG(INFO) << "Failed to convert format for tensor " << out_slot_path;
|
||||
}
|
||||
|
@ -533,7 +534,40 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr) {
|
||||
bool DumpTensorStatsIfNeeded(const std::string &dump_path, const T &tensor, char *data_ptr, const std::string &io,
|
||||
uint32_t slot, const ShapeVector &shape, TypeId type) {
|
||||
if (!DumpJsonParser::GetInstance().IsStatisticDump()) {
|
||||
return true;
|
||||
}
|
||||
size_t pos = dump_path.rfind("/");
|
||||
std::string file_name = dump_path.substr(pos + 1);
|
||||
size_t first_dot = file_name.find(".");
|
||||
size_t second_dot = file_name.find(".", first_dot + 1);
|
||||
size_t third_dot = file_name.find(".", second_dot + 1);
|
||||
size_t fourth_dot = file_name.find(".", third_dot + 1);
|
||||
size_t fifth_dot = file_name.find(".", fourth_dot + 1);
|
||||
std::string op_type = file_name.substr(0, first_dot);
|
||||
std::string op_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
|
||||
std::string task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
|
||||
std::string stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
|
||||
std::string timestamp = file_name.substr(fourth_dot + 1, fifth_dot - fourth_dot - 1);
|
||||
TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, io, slot, slot);
|
||||
std::shared_ptr<TensorData> data = std::make_shared<TensorData>();
|
||||
try {
|
||||
data->ConvertMsToDbgType(type);
|
||||
} catch (...) {
|
||||
MS_LOG(ERROR) << "Data type of operator " << file_name << " is not supported by statistic dump";
|
||||
return false;
|
||||
}
|
||||
data->SetByteSize((size_t)tensor.size());
|
||||
data->SetShape(shape);
|
||||
data->SetDataPtr(data_ptr);
|
||||
return stat_dump.DumpTensorStatsToFile(dump_path.substr(0, pos), data);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr,
|
||||
const std::string &io, uint32_t slot) {
|
||||
// get format
|
||||
auto iter_fmt = kFormatToStringMap.find(tensor.format());
|
||||
if (iter_fmt == kFormatToStringMap.end()) {
|
||||
|
@ -584,13 +618,21 @@ bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tens
|
|||
}
|
||||
}
|
||||
// dump tensor data into npy file
|
||||
bool dump_success = false;
|
||||
bool dump_success = true;
|
||||
if (trans_success) {
|
||||
dump_path += host_format;
|
||||
dump_success = DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type);
|
||||
dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, reinterpret_cast<char *>(trans_buf.data()), io, slot,
|
||||
shape_to, src_type);
|
||||
if (DumpJsonParser::GetInstance().IsTensorDump()) {
|
||||
dump_path += host_format;
|
||||
dump_success =
|
||||
DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type) && dump_success;
|
||||
}
|
||||
} else {
|
||||
dump_path += device_format;
|
||||
dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type);
|
||||
dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, data_ptr, io, slot, shape_to, src_type);
|
||||
if (DumpJsonParser::GetInstance().IsTensorDump()) {
|
||||
dump_path += device_format;
|
||||
dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type) && dump_success;
|
||||
}
|
||||
}
|
||||
return dump_success;
|
||||
}
|
||||
|
|
|
@ -95,7 +95,8 @@ class E2eDump {
|
|||
static nlohmann::json ParseOverflowInfo(char *data_ptr);
|
||||
|
||||
template <typename T>
|
||||
static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr);
|
||||
static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr,
|
||||
const std::string &io, uint32_t slot);
|
||||
#endif
|
||||
|
||||
inline static unsigned int starting_graph_id = INT32_MAX;
|
||||
|
|
|
@ -41,7 +41,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
|
|||
}
|
||||
auto file_path = Common::CreatePrefixPath(path);
|
||||
if (!file_path.has_value()) {
|
||||
MS_LOG(WARNING) << "CreatePrefixPath failed.";
|
||||
MS_LOG(WARNING) << "CreatePrefixPath failed, skipping current statistics";
|
||||
return false;
|
||||
}
|
||||
// try to open file
|
||||
|
@ -55,7 +55,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
|
|||
file_.open(file_path_value, std::ios::out | std::ios::app | std::ios::binary);
|
||||
}
|
||||
if (!file_.is_open()) {
|
||||
MS_LOG(WARNING) << "Open file " << path << " failed." << ErrnoToString(errno);
|
||||
MS_LOG(WARNING) << "Open file " << file_path_value << " failed." << ErrnoToString(errno);
|
||||
return false;
|
||||
}
|
||||
if (first_time_opening) {
|
||||
|
@ -63,7 +63,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
|
|||
file_.flush();
|
||||
file_path_str_ = path;
|
||||
}
|
||||
MS_LOG(INFO) << "Opened file: " << path;
|
||||
MS_LOG(INFO) << "Opened file: " << file_path_value;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -93,9 +93,9 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op
|
|||
size_t tensor_loader_slot)
|
||||
: op_type_{op_type},
|
||||
op_name_{op_name},
|
||||
task_id_{task_id},
|
||||
stream_id_{stream_id},
|
||||
timestamp_{timestamp},
|
||||
task_id_{std::to_string(task_id)},
|
||||
stream_id_{std::to_string(stream_id)},
|
||||
timestamp_{std::to_string(timestamp)},
|
||||
slot_{slot},
|
||||
tensor_loader_slot_{tensor_loader_slot} {
|
||||
if (input) {
|
||||
|
@ -105,6 +105,22 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op
|
|||
}
|
||||
}
|
||||
|
||||
TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id,
|
||||
const std::string &stream_id, const std::string ×tamp, const std::string &io,
|
||||
size_t slot, size_t tensor_loader_slot)
|
||||
: op_type_{op_type},
|
||||
op_name_{op_name},
|
||||
task_id_{task_id},
|
||||
stream_id_{stream_id},
|
||||
timestamp_{timestamp},
|
||||
io_{io},
|
||||
slot_{slot},
|
||||
tensor_loader_slot_{tensor_loader_slot} {
|
||||
if (io_ != kInput && io_ != kOutput) {
|
||||
MS_LOG(EXCEPTION) << "Cannot instantiate TensorStatDump, io needs to be either " << kInput << " or " << kOutput;
|
||||
}
|
||||
}
|
||||
|
||||
bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) {
|
||||
std::string filename = dump_path + "/" + kCsvFileName;
|
||||
// try to open file
|
||||
|
@ -125,16 +141,24 @@ bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) {
|
|||
|
||||
bool TensorStatDump::DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path,
|
||||
const Debugger *debugger) {
|
||||
if (!OpenStatisticsFile(dump_path)) {
|
||||
return false;
|
||||
}
|
||||
// get tensor statistics using debugger
|
||||
// get tensor data using debugger
|
||||
std::string tensor_loader_name = original_kernel_name + ":" + std::to_string(tensor_loader_slot_);
|
||||
std::shared_ptr<TensorData> data = debugger->GetTensor(tensor_loader_name);
|
||||
if (data == nullptr) {
|
||||
MS_LOG(WARNING) << "Failed to find " << tensor_loader_name << " in tensor loader, skipping current statistics";
|
||||
return false;
|
||||
}
|
||||
return DumpTensorStatsToFile(dump_path, data);
|
||||
}
|
||||
|
||||
bool TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr<TensorData> data) {
|
||||
if (data == nullptr) {
|
||||
MS_LOG(WARNING) << "Tensor data is empty, skipping current statistics";
|
||||
return false;
|
||||
}
|
||||
if (!OpenStatisticsFile(dump_path)) {
|
||||
return false;
|
||||
}
|
||||
const DebugServices::TensorStat &stat = DebugServices::GetTensorStatistics(data);
|
||||
// write tensor statistics to csv file
|
||||
std::ostringstream shape;
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
|
||||
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
|
||||
|
@ -24,6 +25,7 @@
|
|||
|
||||
namespace mindspore {
|
||||
class Debugger;
|
||||
class TensorData;
|
||||
class CsvWriter {
|
||||
public:
|
||||
static CsvWriter &GetInstance() {
|
||||
|
@ -31,13 +33,6 @@ class CsvWriter {
|
|||
return instance;
|
||||
}
|
||||
|
||||
private:
|
||||
const std::string kSeparator = ",";
|
||||
const std::string kEndLine = "\n";
|
||||
std::ofstream file_;
|
||||
std::string file_path_str_ = "";
|
||||
|
||||
public:
|
||||
CsvWriter() = default;
|
||||
~CsvWriter();
|
||||
DISABLE_COPY_AND_ASSIGN(CsvWriter)
|
||||
|
@ -45,28 +40,39 @@ class CsvWriter {
|
|||
void CloseFile();
|
||||
template <typename T>
|
||||
void WriteToCsv(const T &val, bool end_line = false);
|
||||
|
||||
private:
|
||||
const std::string kSeparator = ",";
|
||||
const std::string kEndLine = "\n";
|
||||
std::ofstream file_;
|
||||
std::string file_path_str_ = "";
|
||||
};
|
||||
|
||||
class TensorStatDump {
|
||||
static const char CSV_HEADER[];
|
||||
static const char CSV_FILE_NAME[];
|
||||
|
||||
const std::string &op_type_;
|
||||
const std::string &op_name_;
|
||||
uint32_t task_id_;
|
||||
uint32_t stream_id_;
|
||||
uint64_t timestamp_;
|
||||
std::string io_;
|
||||
size_t slot_;
|
||||
size_t tensor_loader_slot_;
|
||||
|
||||
public:
|
||||
static bool OpenStatisticsFile(const std::string &dump_path);
|
||||
|
||||
TensorStatDump(const std::string &op_type, const std::string &op_name, uint32_t task_id, uint32_t stream_id,
|
||||
uint64_t timestamp, bool input, size_t slot, size_t tensor_loader_slot_);
|
||||
TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id,
|
||||
const std::string &stream_id, const std::string ×tamp, const std::string &io, size_t slot,
|
||||
size_t tensor_loader_slot);
|
||||
bool DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr<TensorData> data);
|
||||
bool DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path,
|
||||
const Debugger *debugger);
|
||||
|
||||
private:
|
||||
static const char CSV_HEADER[];
|
||||
static const char CSV_FILE_NAME[];
|
||||
|
||||
const std::string op_type_;
|
||||
const std::string op_name_;
|
||||
const std::string task_id_;
|
||||
const std::string stream_id_;
|
||||
const std::string timestamp_;
|
||||
std::string io_;
|
||||
size_t slot_;
|
||||
size_t tensor_loader_slot_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
|
||||
|
|
|
@ -158,11 +158,15 @@ def generate_statistic_dump_json(dump_path, json_file_name, test_key, saved_data
|
|||
"""
|
||||
if test_key == "test_gpu_e2e_dump":
|
||||
data = e2e_dump_dict
|
||||
data["common_dump_settings"]["path"] = dump_path
|
||||
data["common_dump_settings"]["saved_data"] = saved_data
|
||||
elif test_key == "test_async_dump":
|
||||
data = async_dump_dict
|
||||
data["common_dump_settings"]["input_output"] = 0
|
||||
data["common_dump_settings"]["file_format"] = "npy"
|
||||
else:
|
||||
raise ValueError(
|
||||
"Failed to generate statistic dump json file. The test name value " + test_key + " is invalid.")
|
||||
data["common_dump_settings"]["path"] = dump_path
|
||||
data["common_dump_settings"]["saved_data"] = saved_data
|
||||
with open(json_file_name, 'w') as f:
|
||||
json.dump(data, f)
|
||||
|
||||
|
|
|
@ -409,36 +409,37 @@ def check_statistic_dump(dump_file_path):
|
|||
real_path = os.path.realpath(output_path)
|
||||
with open(real_path) as f:
|
||||
reader = csv.DictReader(f)
|
||||
input1 = next(reader)
|
||||
stats = list(reader)
|
||||
input1 = stats[0]
|
||||
assert input1['IO'] == 'input'
|
||||
assert input1['Min Value'] == '1'
|
||||
assert input1['Max Value'] == '6'
|
||||
input2 = next(reader)
|
||||
input2 = stats[1]
|
||||
assert input2['IO'] == 'input'
|
||||
assert input2['Min Value'] == '7'
|
||||
assert input2['Max Value'] == '12'
|
||||
output = next(reader)
|
||||
output = stats[2]
|
||||
assert output['IO'] == 'output'
|
||||
assert output['Min Value'] == '8'
|
||||
assert output['Max Value'] == '18'
|
||||
|
||||
def check_data_dump(dump_file_path):
    """Check that the dumped output tensor of the Add operator holds the expected values.

    Args:
        dump_file_path: directory that is searched for the Add operator's output ``.npy`` file.

    Raises:
        AssertionError: if no matching dump file exists or its content differs from the expected tensor.
    """
    # The pattern is deliberately loose about the fields between the operator name and "output"
    # and about the format suffix, so it does not depend on one specific dump file naming.
    output_name = "Add.Add-op*.output.0.*.npy"
    matches = glob.glob(os.path.join(dump_file_path, output_name))
    # Fail with a readable message instead of an IndexError when nothing was dumped.
    assert matches, "No dump file matching " + output_name + " found in " + str(dump_file_path)
    real_path = os.path.realpath(matches[0])
    output = np.load(real_path)
    expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
    assert np.array_equal(output, expect)
|
||||
|
||||
def run_gpu_e2e_dump(saved_data):
|
||||
"""Run gpu e2e dump"""
|
||||
def run_saved_data_dump_test(scenario, saved_data):
|
||||
"""Run e2e dump on scenario, testing statistic dump"""
|
||||
if sys.platform != 'linux':
|
||||
return
|
||||
pwd = os.getcwd()
|
||||
with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
|
||||
dump_path = os.path.join(tmp_dir, 'gpu_e2e_dump')
|
||||
dump_config_path = os.path.join(tmp_dir, 'gpu_e2e_dump.json')
|
||||
generate_statistic_dump_json(dump_path, dump_config_path, 'test_gpu_e2e_dump', saved_data)
|
||||
dump_path = os.path.join(tmp_dir, 'test_saved_data')
|
||||
dump_config_path = os.path.join(tmp_dir, 'test_saved_data.json')
|
||||
generate_statistic_dump_json(dump_path, dump_config_path, scenario, saved_data)
|
||||
os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
|
||||
dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
|
||||
if os.path.isdir(dump_path):
|
||||
|
@ -473,7 +474,7 @@ def test_gpu_e2e_statistic_dump():
|
|||
Expectation: Statistics are stored in statistic.csv files
|
||||
"""
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
|
||||
run_gpu_e2e_dump('statistic')
|
||||
run_saved_data_dump_test('test_gpu_e2e_dump', 'statistic')
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
|
@ -486,7 +487,7 @@ def test_gpu_e2e_tensor_dump():
|
|||
Expectation: Tensor data are stored in npy files
|
||||
"""
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
|
||||
run_gpu_e2e_dump('tensor')
|
||||
run_saved_data_dump_test('test_gpu_e2e_dump', 'tensor')
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
|
@ -499,4 +500,46 @@ def test_gpu_e2e_full_dump():
|
|||
Expectation: Tensor are stored in npy files and their statistics stored in statistic.csv
|
||||
"""
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
|
||||
run_gpu_e2e_dump('full')
|
||||
run_saved_data_dump_test('test_gpu_e2e_dump', 'full')
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_statistic_dump():
    """
    Feature: Ascend Statistics Dump
    Description: Run the async dump scenario on Ascend in graph mode with saved_data set to statistic
    Expectation: Statistics are stored in statistic.csv files
    """
    scenario = "test_async_dump"
    saved_data = "statistic"
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_saved_data_dump_test(scenario, saved_data)
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_tensor_dump():
    """
    Feature: Ascend Tensor Dump
    Description: Run the async dump scenario on Ascend in graph mode with saved_data set to tensor
    Expectation: Tensors are stored in npy files
    """
    scenario = "test_async_dump"
    saved_data = "tensor"
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_saved_data_dump_test(scenario, saved_data)
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_full_dump():
    """
    Feature: Ascend Full Dump
    Description: Run the async dump scenario on Ascend in graph mode with saved_data set to full
    Expectation: Tensors are stored in npy files and their statistics stored in statistic.csv
    """
    scenario = "test_async_dump"
    saved_data = "full"
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_saved_data_dump_test(scenario, saved_data)
|
||||
|
|
Loading…
Reference in New Issue