Modify the AT config file

This commit is contained in:
harshvardhangupta 2022-03-15 13:30:37 -04:00
parent 22240df6c0
commit accc32da6c
5 changed files with 77 additions and 23 deletions

View File

@@ -22,6 +22,8 @@
#include <utility>
#include <vector>
#include <string>
#include <sstream>
#include <iomanip>
#ifndef ENABLE_ANDROID
#include "minddata/dataset/engine/datasetops/source/nonmappable_leaf_op.h"
#include "minddata/dataset/engine/serdes.h"
@@ -118,7 +120,13 @@ Status AutoTune::SaveAutotuneConfig(const std::string &file_name) {
// The Execution Tree is built by visiting the optimized IR Tree in DFS order.
// So we visit the optimized IR tree in DFS order and try to match each IR node with its corresponding dataset op.
RETURN_IF_NOT_OK(Serdes::UpdateOptimizedIRTreeJSON(&autotune_config_json_, ops_));
RETURN_IF_NOT_OK(Serdes::SaveJSONToFile(autotune_config_json_, file_name));
std::vector<std::string> summary;
RETURN_IF_NOT_OK(SummarizeTreeConfiguration(&summary));
nlohmann::json out_json;
out_json["summary"] = summary;
out_json["pipeline"] = autotune_config_json_;
out_json["remark"] = "The following file has been auto-generated by the Dataset Autotune.";
RETURN_IF_NOT_OK(Serdes::SaveJSONToFile(out_json, file_name, true));
return Status::OK();
}
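With this change the AT config file is no longer the bare serialized pipeline: it wraps the pipeline JSON together with a human-readable summary and a remark. A rough sketch of the resulting file layout (values are illustrative; nlohmann::json sorts object keys alphabetically, hence this order):

{
    "pipeline": { ...serialized IR tree, as before... },
    "remark": "The following file has been auto-generated by the Dataset Autotune.",
    "summary": [ ...one fixed-width line per non-inlined dataset op... ]
}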
@@ -139,6 +147,23 @@ Status AutoTune::SetAutotuneConfigJson() {
}
#endif
Status AutoTune::SummarizeTreeConfiguration(std::vector<std::string> *out) {
constexpr int op_name_width = 20;
constexpr int val_width = 2;
const auto num_ops = ops_.size();
// Summarize only the non-inlined ops; the DeviceQueueOp (device sink) is skipped.
for (size_t i = 0; i < num_ops; ++i) {
const auto &op = ops_[i];
if (!op->inlined() && op->Name() != "DeviceQueueOp") {
std::stringstream s;
s << std::left << std::setw(op_name_width) << op->NameWithID() << "(num_parallel_workers:" << std::right
<< std::setw(val_width) << op->NumWorkers() << ", connector_queue_size:" << std::setw(val_width)
<< op->ConnectorCapacity() << ")";
(void)out->emplace_back(s.str());
}
}
return Status::OK();
}
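Each summary entry uses fixed-width formatting: the op name with its ID, left-padded to 20 characters, followed by its worker count and connector queue size. An illustrative entry for a hypothetical batch op with 8 workers and a queue of 16:

BatchOp(ID:2)       (num_parallel_workers: 8, connector_queue_size:16)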
void AutoTune::PrintTreeConfiguration() const {
ExecutionTree const *tree = tree_adapter_->tree_.get();
for (auto itr = tree->begin(); itr != tree->end(); itr++) {

View File

@@ -53,6 +53,11 @@ class AutoTune {
/// \brief Helper to print the tree configuration
void PrintTreeConfiguration() const;
/// \brief Helper to summarize the execution tree
/// \param[out] out An output vector of string to store the summary
/// \return Status object
Status SummarizeTreeConfiguration(std::vector<std::string> *out);
#ifndef ENABLE_ANDROID
/// \brief Serialize the dataset and save the AT config (workers and queue size) to a json file
/// \param file_name Name of the file

View File

@@ -15,6 +15,7 @@
*/
#include <fstream>
#include <stack>
#include <iomanip>
#include "minddata/dataset/engine/serdes.h"
#include "minddata/dataset/core/pybind_support.h"
@@ -62,7 +63,8 @@ Status Serdes::SaveToJSON(std::shared_ptr<DatasetNode> node, const std::string &
return Status::OK();
}
Status Serdes::SaveJSONToFile(const nlohmann::json &json_string, const std::string &file_name) {
Status Serdes::SaveJSONToFile(const nlohmann::json &json_string, const std::string &file_name, bool pretty) {
constexpr int field_width = 4;
try {
std::optional<std::string> dir = "";
std::optional<std::string> local_file_name = "";
@@ -80,7 +82,10 @@ Status Serdes::SaveJSONToFile(const nlohmann::json &json_string, const std::stri
FileUtils::ConcatDirAndFileName(&realpath, &local_file_name, &whole_path);
std::ofstream file(whole_path.value());
file << json_string;
if (pretty) {
file << std::setw(field_width);
}
file << json_string << std::endl;
file.close();
ChangeFileMode(whole_path.value(), S_IRUSR | S_IWUSR);
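For context, nlohmann::json treats the stream's field width as an indentation level: file << std::setw(4) << json pretty-prints with 4-space indents (the same output as json.dump(4)), while a plain file << json emits a single line. Existing callers are unaffected because the new pretty parameter defaults to false. A minimal standalone sketch of the two modes:

#include <fstream>
#include <iomanip>
#include <nlohmann/json.hpp>

int main() {
  nlohmann::json j = {{"remark", "demo"}, {"summary", {"BatchOp(ID:2)"}}};
  std::ofstream compact("compact.json");
  compact << j << std::endl;                 // one line: {"remark":"demo","summary":["BatchOp(ID:2)"]}
  std::ofstream pretty("pretty.json");
  pretty << std::setw(4) << j << std::endl;  // indented, 4 spaces per nesting level
  return 0;
}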
@@ -104,6 +109,10 @@ Status Serdes::Deserialize(const std::string &json_filepath, std::shared_ptr<Dat
"Invalid file, failed to parse json file: " + json_filepath + ", error message: " + e.what());
}
json_in.close();
// Handle config generated by dataset autotune
if (json_obj.find("pipeline") != json_obj.end()) {
json_obj = json_obj["pipeline"];
}
RETURN_IF_NOT_OK(ConstructPipeline(json_obj, ds));
return Status::OK();
}
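This keeps Deserialize backward compatible: a legacy file holds the bare pipeline JSON and is used as-is, while the new AT config file is first unwrapped to its "pipeline" subtree. A minimal sketch of the equivalence, using a hypothetical one-node pipeline:

#include <cassert>
#include <nlohmann/json.hpp>

int main() {
  nlohmann::json legacy = {{"op_type", "Batch"}};  // hypothetical pipeline subtree
  nlohmann::json at_config = {{"pipeline", legacy}, {"remark", "demo"}, {"summary", nlohmann::json::array()}};
  if (at_config.find("pipeline") != at_config.end()) {
    at_config = at_config["pipeline"];  // unwrap the AT config wrapper
  }
  assert(at_config == legacy);  // both file shapes yield the same pipeline JSON
  return 0;
}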

View File

@@ -195,8 +195,9 @@ class Serdes {
/// \brief Helper function to save JSON to a file
/// \param[in] json_string The JSON string to be saved to the file
/// \param[in] file_name The file name
/// \param[in] pretty Flag to control pretty printing of JSON string to the file
/// \return Status The status code returned
static Status SaveJSONToFile(const nlohmann::json &json_string, const std::string &file_name);
static Status SaveJSONToFile(const nlohmann::json &json_string, const std::string &file_name, bool pretty = false);
protected:
/// \brief Function to determine type of the node - dataset node if no dataset exists or operation node

View File

@@ -15,7 +15,7 @@
"""
Test Dataset AutoTune's Save and Load Configuration support
"""
import filecmp
import json
import numpy as np
import pytest
@@ -27,6 +27,17 @@ MNIST_DATA_DIR = "../data/dataset/testMnistData"
DATA_DIR = "../data/dataset/testPK/data"
def data_pipeline_same(file1, file2):
"""Return True when the two JSON files describe the same dataset pipeline, unwrapping the AT config format if present."""
assert file1.exists()
assert file2.exists()
with file1.open() as f1, file2.open() as f2:
pipeline1 = json.load(f1)
pipeline1 = pipeline1["pipeline"] if "pipeline" in pipeline1 else pipeline1
pipeline2 = json.load(f2)
pipeline2 = pipeline2["pipeline"] if "pipeline" in pipeline2 else pipeline2
return pipeline1 == pipeline2
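The helper deliberately compares only the "pipeline" subtree, so a new-format AT config file (which adds "summary" and "remark" keys) still compares equal to a plain serialized pipeline file. This is also why the filecmp.cmp checks below are replaced: a byte-for-byte comparison can no longer succeed between the two formats.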
@pytest.mark.forked
class TestAutotuneSaveLoad:
# Note: Use pytest fixture tmp_path to create files within this temporary directory,
@@ -40,14 +51,14 @@ class TestAutotuneSaveLoad:
Expectation: pipeline runs successfully
"""
original_autotune = ds.config.get_enable_autotune()
ds.config.set_enable_autotune(True, str(tmp_path) + "test_autotune_generator_atfinal.json")
ds.config.set_enable_autotune(True, str(tmp_path / "test_autotune_generator_atfinal.json"))
source = [(np.array([x]),) for x in range(1024)]
data1 = ds.GeneratorDataset(source, ["data"])
data1 = data1.shuffle(64)
data1 = data1.batch(32)
ds.serialize(data1, str(tmp_path) + "test_autotune_generator_serialized.json")
ds.serialize(data1, str(tmp_path / "test_autotune_generator_serialized.json"))
itr = data1.create_dict_iterator(num_epochs=5)
for _ in range(5):
@@ -64,7 +75,7 @@ class TestAutotuneSaveLoad:
Expectation: pipeline runs successfully
"""
original_autotune = ds.config.get_enable_autotune()
ds.config.set_enable_autotune(True, str(tmp_path) + "test_autotune_mnist_pipeline_atfinal.json")
ds.config.set_enable_autotune(True, str(tmp_path / "test_autotune_mnist_pipeline_atfinal.json"))
original_seed = ds.config.get_seed()
ds.config.set_seed(1)
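(Fixing the seed keeps the dataset sampling deterministic, so the record-by-record comparison of the two deserialized pipelines below is stable.)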
@@ -74,19 +85,20 @@ class TestAutotuneSaveLoad:
data1 = data1.batch(batch_size=10, drop_remainder=True)
ds.serialize(data1, str(tmp_path) + "test_autotune_mnist_pipeline_serialized.json")
ds.serialize(data1, str(tmp_path / "test_autotune_mnist_pipeline_serialized.json"))
for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
pass
ds.config.set_enable_autotune(original_autotune)
# Confirm final AutoTune config file is identical to the serialized file.
assert filecmp.cmp(str(tmp_path) + "test_autotune_mnist_pipeline_atfinal.json",
str(tmp_path) + "test_autotune_mnist_pipeline_serialized.json")
# Confirm final AutoTune config file pipeline is identical to the serialized file pipeline.
file1 = tmp_path / "test_autotune_mnist_pipeline_atfinal.json"
file2 = tmp_path / "test_autotune_mnist_pipeline_serialized.json"
assert data_pipeline_same(file1, file2)
desdata1 = ds.deserialize(json_filepath=str(tmp_path) + "test_autotune_mnist_pipeline_atfinal.json")
desdata2 = ds.deserialize(json_filepath=str(tmp_path) + "test_autotune_mnist_pipeline_serialized.json")
desdata1 = ds.deserialize(json_filepath=str(tmp_path / "test_autotune_mnist_pipeline_atfinal.json"))
desdata2 = ds.deserialize(json_filepath=str(tmp_path / "test_autotune_mnist_pipeline_serialized.json"))
num = 0
for newdata1, newdata2 in zip(desdata1.create_dict_iterator(num_epochs=1, output_numpy=True),
@@ -110,7 +122,7 @@ class TestAutotuneSaveLoad:
at_final_json_filename = "test_autotune_save_overwrite_generator_atfinal.json"
original_autotune = ds.config.get_enable_autotune()
ds.config.set_enable_autotune(True, str(tmp_path) + at_final_json_filename)
ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))
data1 = ds.GeneratorDataset(source, ["data"])
@@ -143,14 +155,14 @@ class TestAutotuneSaveLoad:
# Pipeline#1
original_autotune = ds.config.get_enable_autotune()
ds.config.set_enable_autotune(True, str(tmp_path) + at_final_json_filename)
ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))
data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=100)
one_hot_encode = c_transforms.OneHot(10) # num_classes is input argument
data1 = data1.map(operations=one_hot_encode, input_columns="label")
data1 = data1.batch(batch_size=10, drop_remainder=True)
ds.serialize(data1, str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized1.json")
ds.serialize(data1, str(tmp_path / "test_autotune_save_overwrite_mnist_serialized1.json"))
for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
pass
@@ -158,14 +170,14 @@ class TestAutotuneSaveLoad:
ds.config.set_enable_autotune(False)
# Pipeline#2
ds.config.set_enable_autotune(True, str(tmp_path) + at_final_json_filename)
ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))
data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=200)
data1 = data1.map(operations=one_hot_encode, input_columns="label")
data1 = data1.shuffle(40)
data1 = data1.batch(batch_size=20, drop_remainder=False)
ds.serialize(data1, str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized2.json")
ds.serialize(data1, str(tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"))
for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
pass
@@ -173,12 +185,14 @@ class TestAutotuneSaveLoad:
ds.config.set_enable_autotune(False)
# Confirm 2nd serialized file is identical to final AutoTune config file.
assert filecmp.cmp(str(tmp_path) + "test_autotune_save_overwrite_mnist_atfinal.json",
str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized2.json")
file1 = tmp_path / "test_autotune_save_overwrite_mnist_atfinal.json"
file2 = tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"
assert data_pipeline_same(file1, file2)
# Confirm the serialized files for the 2 different pipelines are different
assert not filecmp.cmp(str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized1.json",
str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized2.json")
file1 = tmp_path / "test_autotune_save_overwrite_mnist_serialized1.json"
file2 = tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"
assert not data_pipeline_same(file1, file2)
ds.config.set_seed(original_seed)
ds.config.set_enable_autotune(original_autotune)