Add statistic dump for Ascend

Jimmy Qi 2021-12-02 16:44:33 +00:00
parent 811d7128c7
commit b9d1a4920c
7 changed files with 175 additions and 55 deletions

View File

@@ -382,9 +382,9 @@ void DumpJsonParser::ParseSavedData(const nlohmann::json &content) {
                       << saved_data_ << ". Please set saved_data to either statistic, tensor, or full";
   }
   auto context = MsContext::GetInstance();
-  if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kGPUDevice) {
-    MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU device, please set "
-                         "saved_data to tensor or use a GPU device";
+  if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
+    MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU and Ascend, please "
+                         "set saved_data to tensor or use a GPU or Ascend device";
   }
 }
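Note that the check above now rejects only the CPU target, so both GPU and Ascend pass validation. For orientation, the two predicates it relies on map the three `saved_data` modes as follows; a minimal sketch, with a hypothetical member layout (the predicate names are the real ones used throughout this commit):

```cpp
#include <string>

// Sketch of the saved_data -> dump-mode mapping assumed by this commit:
// "full" turns on both the statistic CSV and the tensor npy dumps.
struct SavedDataModes {
  std::string saved_data_;  // "statistic", "tensor", or "full"
  bool IsStatisticDump() const { return saved_data_ == "statistic" || saved_data_ == "full"; }
  bool IsTensorDump() const { return saved_data_ == "tensor" || saved_data_ == "full"; }
};
```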

View File

@@ -19,6 +19,7 @@
 #include <unistd.h>
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <set>
 #include <utility>
 #include <vector>
@@ -511,7 +512,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
   for (uint32_t slot = 0; slot < input_tensors.size(); slot++) {
     auto in_tensor = input_tensors[slot];
     std::string in_slot_path = in_path + std::to_string(slot) + ".";
-    auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset);
+    auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset, "input", slot);
     if (!succ) {
       MS_LOG(INFO) << "Failed to convert format for tensor " << in_slot_path;
     }
@@ -524,7 +525,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
   for (uint32_t slot = 0; slot < output_tensors.size(); slot++) {
     auto out_tensor = output_tensors[slot];
     std::string out_slot_path = out_path + std::to_string(slot) + ".";
-    auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset);
+    auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset, "output", slot);
     if (!succ) {
       MS_LOG(INFO) << "Failed to convert format for tensor " << out_slot_path;
     }
@@ -533,7 +534,40 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
 }
 
 template <typename T>
-bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr) {
+bool DumpTensorStatsIfNeeded(const std::string &dump_path, const T &tensor, char *data_ptr, const std::string &io,
+                             uint32_t slot, const ShapeVector &shape, TypeId type) {
+  if (!DumpJsonParser::GetInstance().IsStatisticDump()) {
+    return true;
+  }
+  size_t pos = dump_path.rfind("/");
+  std::string file_name = dump_path.substr(pos + 1);
+  size_t first_dot = file_name.find(".");
+  size_t second_dot = file_name.find(".", first_dot + 1);
+  size_t third_dot = file_name.find(".", second_dot + 1);
+  size_t fourth_dot = file_name.find(".", third_dot + 1);
+  size_t fifth_dot = file_name.find(".", fourth_dot + 1);
+  std::string op_type = file_name.substr(0, first_dot);
+  std::string op_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
+  std::string task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
+  std::string stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
+  std::string timestamp = file_name.substr(fourth_dot + 1, fifth_dot - fourth_dot - 1);
+  TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, io, slot, slot);
+  std::shared_ptr<TensorData> data = std::make_shared<TensorData>();
+  try {
+    data->ConvertMsToDbgType(type);
+  } catch (...) {
+    MS_LOG(ERROR) << "Data type of operator " << file_name << " is not supported by statistic dump";
+    return false;
+  }
+  data->SetByteSize((size_t)tensor.size());
+  data->SetShape(shape);
+  data->SetDataPtr(data_ptr);
+  return stat_dump.DumpTensorStatsToFile(dump_path.substr(0, pos), data);
+}
+
+template <typename T>
+bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr,
+                                            const std::string &io, uint32_t slot) {
   // get format
   auto iter_fmt = kFormatToStringMap.find(tensor.format());
   if (iter_fmt == kFormatToStringMap.end()) {
@@ -584,13 +618,21 @@ bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor,
     }
   }
   // dump tensor data into npy file
-  bool dump_success = false;
+  bool dump_success = true;
   if (trans_success) {
-    dump_path += host_format;
-    dump_success = DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type);
+    dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, reinterpret_cast<char *>(trans_buf.data()), io, slot,
+                                           shape_to, src_type);
+    if (DumpJsonParser::GetInstance().IsTensorDump()) {
+      dump_path += host_format;
+      dump_success =
+        DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type) && dump_success;
+    }
   } else {
-    dump_path += device_format;
-    dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type);
+    dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, data_ptr, io, slot, shape_to, src_type);
+    if (DumpJsonParser::GetInstance().IsTensorDump()) {
+      dump_path += device_format;
+      dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type) && dump_success;
+    }
   }
   return dump_success;
 }
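The new `DumpTensorStatsIfNeeded` recovers operator metadata by splitting the base name of `dump_path` on its first five dots, following the `op_type.op_name.task_id.stream_id.timestamp` naming scheme of Ascend dump files. A standalone sketch of that split, using a hypothetical file name:

```cpp
#include <iostream>
#include <string>

// Demonstrates the dot-delimited split used by DumpTensorStatsIfNeeded on a
// made-up name of the form op_type.op_name.task_id.stream_id.timestamp.io.slot.
int main() {
  std::string file_name = "Add.Add-op1.0.0.1638463473402051.input.0.";
  const char *fields[] = {"op_type", "op_name", "task_id", "stream_id", "timestamp"};
  size_t start = 0;
  for (const char *field : fields) {
    size_t dot = file_name.find('.', start);
    std::cout << field << " = " << file_name.substr(start, dot - start) << "\n";
    start = dot + 1;
  }
  // Prints: op_type = Add, op_name = Add-op1, task_id = 0, stream_id = 0,
  //         timestamp = 1638463473402051
  return 0;
}
```

Note also the semantic change to `dump_success` above: it now starts out true and each enabled stage (statistics, then the optional npy dump) ANDs its own result in, so a statistic-only run no longer reports failure merely because the npy dump was skipped.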

View File

@@ -95,7 +95,8 @@ class E2eDump {
   static nlohmann::json ParseOverflowInfo(char *data_ptr);
 
   template <typename T>
-  static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr);
+  static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr,
+                                            const std::string &io, uint32_t slot);
 #endif
   inline static unsigned int starting_graph_id = INT32_MAX;

View File

@@ -41,7 +41,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
   }
   auto file_path = Common::CreatePrefixPath(path);
   if (!file_path.has_value()) {
-    MS_LOG(WARNING) << "CreatePrefixPath failed.";
+    MS_LOG(WARNING) << "CreatePrefixPath failed, skipping current statistics";
     return false;
   }
   // try to open file
@@ -55,7 +55,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
     file_.open(file_path_value, std::ios::out | std::ios::app | std::ios::binary);
   }
   if (!file_.is_open()) {
-    MS_LOG(WARNING) << "Open file " << path << " failed." << ErrnoToString(errno);
+    MS_LOG(WARNING) << "Open file " << file_path_value << " failed." << ErrnoToString(errno);
     return false;
   }
   if (first_time_opening) {
@@ -63,7 +63,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
     file_.flush();
     file_path_str_ = path;
   }
-  MS_LOG(INFO) << "Opened file: " << path;
+  MS_LOG(INFO) << "Opened file: " << file_path_value;
   return true;
 }
@@ -93,9 +93,9 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name,
                                size_t tensor_loader_slot)
     : op_type_{op_type},
       op_name_{op_name},
-      task_id_{task_id},
-      stream_id_{stream_id},
-      timestamp_{timestamp},
+      task_id_{std::to_string(task_id)},
+      stream_id_{std::to_string(stream_id)},
+      timestamp_{std::to_string(timestamp)},
       slot_{slot},
       tensor_loader_slot_{tensor_loader_slot} {
   if (input) {
@@ -105,6 +105,22 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name,
   }
 }
 
+TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id,
+                               const std::string &stream_id, const std::string &timestamp, const std::string &io,
+                               size_t slot, size_t tensor_loader_slot)
+    : op_type_{op_type},
+      op_name_{op_name},
+      task_id_{task_id},
+      stream_id_{stream_id},
+      timestamp_{timestamp},
+      io_{io},
+      slot_{slot},
+      tensor_loader_slot_{tensor_loader_slot} {
+  if (io_ != kInput && io_ != kOutput) {
+    MS_LOG(EXCEPTION) << "Cannot instantiate TensorStatDump, io needs to be either " << kInput << " or " << kOutput;
+  }
+}
+
 bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) {
   std::string filename = dump_path + "/" + kCsvFileName;
   // try to open file
@@ -125,16 +141,24 @@ bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) {
 bool TensorStatDump::DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path,
                                            const Debugger *debugger) {
-  if (!OpenStatisticsFile(dump_path)) {
-    return false;
-  }
-  // get tensor statistics using debugger
+  // get tensor data using debugger
   std::string tensor_loader_name = original_kernel_name + ":" + std::to_string(tensor_loader_slot_);
   std::shared_ptr<TensorData> data = debugger->GetTensor(tensor_loader_name);
   if (data == nullptr) {
     MS_LOG(WARNING) << "Failed to find " << tensor_loader_name << " in tensor loader, skipping current statistics";
     return false;
   }
+  return DumpTensorStatsToFile(dump_path, data);
+}
+
+bool TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr<TensorData> data) {
+  if (data == nullptr) {
+    MS_LOG(WARNING) << "Tensor data is empty, skipping current statistics";
+    return false;
+  }
+  if (!OpenStatisticsFile(dump_path)) {
+    return false;
+  }
   const DebugServices::TensorStat &stat = DebugServices::GetTensorStatistics(data);
   // write tensor statistics to csv file
   std::ostringstream shape;
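Together with the new string-based constructor, the refactor leaves two entry points: the GPU e2e path still resolves the tensor through the debugger's tensor loader, while the Ascend path hands over a TensorData the caller assembled itself. A hedged sketch of the two call shapes (the include path, names, paths, and values are hypothetical):

```cpp
#include <memory>
#include "debug/data_dump/tensor_stat_dump.h"  // assumed include path

void DumpStatsBothWays(const mindspore::Debugger *debugger,
                       const std::shared_ptr<mindspore::TensorData> &data) {
  // GPU e2e dump: numeric ids, tensor looked up via the debugger.
  mindspore::TensorStatDump gpu_stat("Add", "Add-op1", 0, 0, 1638463473402051ULL,
                                     /*input=*/true, /*slot=*/0, /*tensor_loader_slot=*/0);
  gpu_stat.DumpTensorStatsToFile("Default/Add-op1", "/dump/rank_0/Net/0/0", debugger);
  // Ascend dump: ids arrive as strings parsed from the file name, io as "input"/"output".
  mindspore::TensorStatDump ascend_stat("Add", "Add-op1", "0", "0", "1638463473402051",
                                        "input", /*slot=*/0, /*tensor_loader_slot=*/0);
  ascend_stat.DumpTensorStatsToFile("/dump/rank_0/Net/0/0", data);
}
```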

View File

@@ -17,6 +17,7 @@
 #ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
 #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
 
+#include <memory>
 #include <string>
 #include <fstream>
@@ -24,6 +25,7 @@
 namespace mindspore {
 class Debugger;
+class TensorData;
 
 class CsvWriter {
  public:
  static CsvWriter &GetInstance() {
@@ -31,13 +33,6 @@ class CsvWriter {
     return instance;
   }
 
- private:
-  const std::string kSeparator = ",";
-  const std::string kEndLine = "\n";
-  std::ofstream file_;
-  std::string file_path_str_ = "";
-
- public:
   CsvWriter() = default;
   ~CsvWriter();
   DISABLE_COPY_AND_ASSIGN(CsvWriter)
@@ -45,28 +40,39 @@ class CsvWriter {
   void CloseFile();
   template <typename T>
   void WriteToCsv(const T &val, bool end_line = false);
+
+ private:
+  const std::string kSeparator = ",";
+  const std::string kEndLine = "\n";
+  std::ofstream file_;
+  std::string file_path_str_ = "";
 };
 
 class TensorStatDump {
-  static const char CSV_HEADER[];
-  static const char CSV_FILE_NAME[];
-  const std::string &op_type_;
-  const std::string &op_name_;
-  uint32_t task_id_;
-  uint32_t stream_id_;
-  uint64_t timestamp_;
-  std::string io_;
-  size_t slot_;
-  size_t tensor_loader_slot_;
-
  public:
   static bool OpenStatisticsFile(const std::string &dump_path);
 
   TensorStatDump(const std::string &op_type, const std::string &op_name, uint32_t task_id, uint32_t stream_id,
                  uint64_t timestamp, bool input, size_t slot, size_t tensor_loader_slot_);
+  TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id,
+                 const std::string &stream_id, const std::string &timestamp, const std::string &io, size_t slot,
+                 size_t tensor_loader_slot);
+  bool DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr<TensorData> data);
   bool DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path,
                              const Debugger *debugger);
+
+ private:
+  static const char CSV_HEADER[];
+  static const char CSV_FILE_NAME[];
+  const std::string op_type_;
+  const std::string op_name_;
+  const std::string task_id_;
+  const std::string stream_id_;
+  const std::string timestamp_;
+  std::string io_;
+  size_t slot_;
+  size_t tensor_loader_slot_;
 };
 }  // namespace mindspore
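With CsvWriter's public surface now grouped ahead of its private state, its intended use reads directly off the header; a minimal usage sketch (the include path is assumed, and the header string here is illustrative — the real one is the CSV_HEADER constant defined in tensor_stat_dump.cc):

```cpp
#include <string>
#include "debug/data_dump/tensor_stat_dump.h"  // assumed include path

void WriteOneStatRow() {
  auto &csv = mindspore::CsvWriter::GetInstance();
  // OpenFile writes the header only the first time this path is opened.
  if (!csv.OpenFile("/tmp/statistic.csv", "Op Type,Op Name,IO,Min Value,Max Value\n")) {
    return;  // OpenFile already logged a warning
  }
  csv.WriteToCsv(std::string("Add"));     // fields are joined with kSeparator...
  csv.WriteToCsv(std::string("Add-op1"));
  csv.WriteToCsv(std::string("output"));
  csv.WriteToCsv(8);
  csv.WriteToCsv(18, /*end_line=*/true);  // ...and end_line appends kEndLine
  csv.CloseFile();
}
```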

View File

@@ -158,11 +158,15 @@ def generate_statistic_dump_json(dump_path, json_file_name, test_key, saved_data):
     """
     if test_key == "test_gpu_e2e_dump":
         data = e2e_dump_dict
-        data["common_dump_settings"]["path"] = dump_path
-        data["common_dump_settings"]["saved_data"] = saved_data
+    elif test_key == "test_async_dump":
+        data = async_dump_dict
+        data["common_dump_settings"]["input_output"] = 0
+        data["common_dump_settings"]["file_format"] = "npy"
     else:
         raise ValueError(
             "Failed to generate statistic dump json file. The test name value " + test_key + " is invalid.")
+    data["common_dump_settings"]["path"] = dump_path
+    data["common_dump_settings"]["saved_data"] = saved_data
     with open(json_file_name, 'w') as f:
         json.dump(data, f)
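With the shared assignments hoisted out of the branches, both scenarios get `path` and `saved_data` set the same way. For orientation, the generated async config has roughly this shape; only the keys touched in the function above are certain from this diff, the rest follow the usual MindSpore dump-config schema and are labeled as assumptions:

```python
# Hypothetical contents of the generated json for the 'test_async_dump'
# scenario; keys not set by generate_statistic_dump_json are assumed defaults.
async_dump_config = {
    "common_dump_settings": {
        "dump_mode": 0,                  # assumed
        "path": "/tmp/test_saved_data",  # set from dump_path
        "net_name": "Net",               # assumed
        "iteration": "all",              # assumed
        "saved_data": "statistic",       # set from saved_data
        "input_output": 0,               # set above: dump inputs and outputs
        "kernels": [],                   # assumed
        "op_debug_mode": 0,              # assumed
        "file_format": "npy",            # set above: host-readable npy files
    }
}
```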

View File

@@ -409,36 +409,37 @@ def check_statistic_dump(dump_file_path):
     real_path = os.path.realpath(output_path)
     with open(real_path) as f:
         reader = csv.DictReader(f)
-        input1 = next(reader)
+        stats = list(reader)
+        input1 = stats[0]
+        assert input1['IO'] == 'input'
         assert input1['Min Value'] == '1'
         assert input1['Max Value'] == '6'
-        input2 = next(reader)
+        input2 = stats[1]
+        assert input2['IO'] == 'input'
         assert input2['Min Value'] == '7'
         assert input2['Max Value'] == '12'
-        output = next(reader)
+        output = stats[2]
+        assert output['IO'] == 'output'
         assert output['Min Value'] == '8'
         assert output['Max Value'] == '18'
 
 
 def check_data_dump(dump_file_path):
-    output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
+    output_name = "Add.Add-op*.output.0.*.npy"
     output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
     real_path = os.path.realpath(output_path)
     output = np.load(real_path)
     expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
     assert np.array_equal(output, expect)
 
 
-def run_gpu_e2e_dump(saved_data):
-    """Run gpu e2e dump"""
+def run_saved_data_dump_test(scenario, saved_data):
+    """Run e2e dump on scenario, testing statistic dump"""
     if sys.platform != 'linux':
         return
     pwd = os.getcwd()
     with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
-        dump_path = os.path.join(tmp_dir, 'gpu_e2e_dump')
-        dump_config_path = os.path.join(tmp_dir, 'gpu_e2e_dump.json')
-        generate_statistic_dump_json(dump_path, dump_config_path, 'test_gpu_e2e_dump', saved_data)
+        dump_path = os.path.join(tmp_dir, 'test_saved_data')
+        dump_config_path = os.path.join(tmp_dir, 'test_saved_data.json')
+        generate_statistic_dump_json(dump_path, dump_config_path, scenario, saved_data)
         os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
         dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
         if os.path.isdir(dump_path):
@@ -473,7 +474,7 @@ def test_gpu_e2e_statistic_dump():
     Expectation: Statistics are stored in statistic.csv files
     """
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    run_gpu_e2e_dump('statistic')
+    run_saved_data_dump_test('test_gpu_e2e_dump', 'statistic')
 
 
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
@@ -486,7 +487,7 @@ def test_gpu_e2e_tensor_dump():
     Expectation: Tensor data are stored in npy files
     """
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    run_gpu_e2e_dump('tensor')
+    run_saved_data_dump_test('test_gpu_e2e_dump', 'tensor')
 
 
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
@@ -499,4 +500,46 @@ def test_gpu_e2e_full_dump():
     Expectation: Tensor are stored in npy files and their statistics stored in statistic.csv
     """
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    run_gpu_e2e_dump('full')
+    run_saved_data_dump_test('test_gpu_e2e_dump', 'full')
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_statistic_dump():
+    """
+    Feature: Ascend Statistics Dump
+    Description: Test Ascend statistics dump
+    Expectation: Statistics are stored in statistic.csv files
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+    run_saved_data_dump_test('test_async_dump', 'statistic')
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_tensor_dump():
+    """
+    Feature: Ascend Tensor Dump
+    Description: Test Ascend tensor dump
+    Expectation: Tensors are stored in npy files
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+    run_saved_data_dump_test('test_async_dump', 'tensor')
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_full_dump():
+    """
+    Feature: Ascend Full Dump
+    Description: Test Ascend full dump
+    Expectation: Tensors are stored in npy files and their statistics stored in statistic.csv
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+    run_saved_data_dump_test('test_async_dump', 'full')
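For reference, the rows check_statistic_dump walks over correspond to the Add network's two inputs and its output. Restricted to the columns the test actually asserts (the real statistic.csv carries the full CSV_HEADER column set), they look like:

```python
# Asserted subset of the statistic.csv rows for x + y, with
# x = [[1, 2, 3], [4, 5, 6]] and y = [[7, 8, 9], [10, 11, 12]] (inferred from
# the expected values; the actual network lives elsewhere in this test file).
expected_stats = [
    {'IO': 'input',  'Min Value': '1', 'Max Value': '6'},
    {'IO': 'input',  'Min Value': '7', 'Max Value': '12'},
    {'IO': 'output', 'Min Value': '8', 'Max Value': '18'},
]
```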