Modify the AT config file

This commit is contained in:
harshvardhangupta 2022-03-15 13:30:37 -04:00
parent 22240df6c0
commit accc32da6c
5 changed files with 77 additions and 23 deletions

View File

@@ -22,6 +22,8 @@
#include <utility>
#include <vector>
#include <string>
#include <sstream>
#include <iomanip>
#ifndef ENABLE_ANDROID
#include "minddata/dataset/engine/datasetops/source/nonmappable_leaf_op.h"
#include "minddata/dataset/engine/serdes.h"
@@ -118,7 +120,13 @@ Status AutoTune::SaveAutotuneConfig(const std::string &file_name) {
// The Execution Tree is built by visiting the optimized IR Tree in DFS order.
// So we visit the optimized IR tree in DFS order and try to match each IR node with its corresponding dataset op.
RETURN_IF_NOT_OK(Serdes::UpdateOptimizedIRTreeJSON(&autotune_config_json_, ops_));
RETURN_IF_NOT_OK(Serdes::SaveJSONToFile(autotune_config_json_, file_name));
std::vector<std::string> summary;
RETURN_IF_NOT_OK(SummarizeTreeConfiguration(&summary));
nlohmann::json out_json;
out_json["summary"] = summary;
out_json["pipeline"] = autotune_config_json_;
out_json["remark"] = "The following file has been auto-generated by the Dataset Autotune.";
RETURN_IF_NOT_OK(Serdes::SaveJSONToFile(out_json, file_name, true));
return Status::OK();
}
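With this change the AT config file is no longer the bare serialized pipeline: it wraps the pipeline JSON together with a human-readable summary and a remark. A rough sketch of the resulting file layout (values are illustrative; nlohmann::json sorts object keys alphabetically, hence this order):

{
    "pipeline": { ...serialized IR tree, as before... },
    "remark": "The following file has been auto-generated by the Dataset Autotune.",
    "summary": [ ...one fixed-width line per non-inlined dataset op... ]
}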
@@ -139,6 +147,23 @@ Status AutoTune::SetAutotuneConfigJson() {
}
#endif
Status AutoTune::SummarizeTreeConfiguration(std::vector<std::string> *out) {
constexpr int op_name_width = 20;
constexpr int val_width = 2;
const auto num_ops = ops_.size();
// Summarize only the non-inlined ops; the DeviceQueueOp (device sink) is skipped.
for (size_t i = 0; i < num_ops; ++i) {
const auto &op = ops_[i];
if (!op->inlined() && op->Name() != "DeviceQueueOp") {
std::stringstream s;
s << std::left << std::setw(op_name_width) << op->NameWithID() << "(num_parallel_workers:" << std::right
<< std::setw(val_width) << op->NumWorkers() << ", connector_queue_size:" << std::setw(val_width)
<< op->ConnectorCapacity() << ")";
(void)out->emplace_back(s.str());
}
}
return Status::OK();
}
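Each summary entry uses fixed-width formatting: the op name with its ID, left-padded to 20 characters, followed by its worker count and connector queue size. An illustrative entry for a hypothetical batch op with 8 workers and a queue of 16:

BatchOp(ID:2)       (num_parallel_workers: 8, connector_queue_size:16)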
void AutoTune::PrintTreeConfiguration() const {
ExecutionTree const *tree = tree_adapter_->tree_.get();
for (auto itr = tree->begin(); itr != tree->end(); itr++) {

View File

@@ -53,6 +53,11 @@ class AutoTune {
/// \brief Helper to print the tree configuration
void PrintTreeConfiguration() const;
/// \brief Helper to summarize the execution tree
/// \param[out] out An output vector of string to store the summary
/// \return Status object
Status SummarizeTreeConfiguration(std::vector<std::string> *out);
#ifndef ENABLE_ANDROID
/// \brief Serialize the dataset and save the AT config (workers and queue size) to a json file
/// \param file_name Name of the file

View File

@@ -15,6 +15,7 @@
*/
#include <fstream>
#include <stack>
#include <iomanip>
#include "minddata/dataset/engine/serdes.h"
#include "minddata/dataset/core/pybind_support.h"
@@ -62,7 +63,8 @@ Status Serdes::SaveToJSON(std::shared_ptr<DatasetNode> node, const std::string &
return Status::OK();
}
Status Serdes::SaveJSONToFile(const nlohmann::json &json_string, const std::string &file_name) {
Status Serdes::SaveJSONToFile(const nlohmann::json &json_string, const std::string &file_name, bool pretty) {
constexpr int field_width = 4;
try {
std::optional<std::string> dir = "";
std::optional<std::string> local_file_name = "";
@@ -80,7 +82,10 @@ Status Serdes::SaveJSONToFile(const nlohmann::json &json_string, const std::stri
FileUtils::ConcatDirAndFileName(&realpath, &local_file_name, &whole_path);
std::ofstream file(whole_path.value());
file << json_string;
if (pretty) {
file << std::setw(field_width);
}
file << json_string << std::endl;
file.close();
ChangeFileMode(whole_path.value(), S_IRUSR | S_IWUSR);
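For context, nlohmann::json treats the stream's field width as an indentation level: file << std::setw(4) << json pretty-prints with 4-space indents (the same output as json.dump(4)), while a plain file << json emits a single line. Existing callers are unaffected because the new pretty parameter defaults to false. A minimal standalone sketch of the two modes:

#include <fstream>
#include <iomanip>
#include <nlohmann/json.hpp>

int main() {
  nlohmann::json j = {{"remark", "demo"}, {"summary", {"BatchOp(ID:2)"}}};
  std::ofstream compact("compact.json");
  compact << j << std::endl;                 // one line: {"remark":"demo","summary":["BatchOp(ID:2)"]}
  std::ofstream pretty("pretty.json");
  pretty << std::setw(4) << j << std::endl;  // indented, 4 spaces per nesting level
  return 0;
}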
@@ -104,6 +109,10 @@ Status Serdes::Deserialize(const std::string &json_filepath, std::shared_ptr<Dat
"Invalid file, failed to parse json file: " + json_filepath + ", error message: " + e.what());
}
json_in.close();
// Handle config generated by dataset autotune
if (json_obj.find("pipeline") != json_obj.end()) {
json_obj = json_obj["pipeline"];
}
RETURN_IF_NOT_OK(ConstructPipeline(json_obj, ds));
return Status::OK();
}
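This keeps Deserialize backward compatible: a legacy file holds the bare pipeline JSON and is used as-is, while the new AT config file is first unwrapped to its "pipeline" subtree. A minimal sketch of the equivalence, using a hypothetical one-node pipeline:

#include <cassert>
#include <nlohmann/json.hpp>

int main() {
  nlohmann::json legacy = {{"op_type", "Batch"}};  // hypothetical pipeline subtree
  nlohmann::json at_config = {{"pipeline", legacy}, {"remark", "demo"}, {"summary", nlohmann::json::array()}};
  if (at_config.find("pipeline") != at_config.end()) {
    at_config = at_config["pipeline"];  // unwrap the AT config wrapper
  }
  assert(at_config == legacy);  // both file shapes yield the same pipeline JSON
  return 0;
}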

View File

@@ -195,8 +195,9 @@ class Serdes {
/// \brief Helper function to save JSON to a file
/// \param[in] json_string The JSON string to be saved to the file
/// \param[in] file_name The file name
/// \param[in] pretty Flag to control pretty printing of JSON string to the file
/// \return Status The status code returned
static Status SaveJSONToFile(const nlohmann::json &json_string, const std::string &file_name);
static Status SaveJSONToFile(const nlohmann::json &json_string, const std::string &file_name, bool pretty = false);
protected:
/// \brief Function to determine type of the node - dataset node if no dataset exists or operation node

View File

@@ -15,7 +15,7 @@
"""
Test Dataset AutoTune's Save and Load Configuration support
"""
import filecmp
import json
import numpy as np
import pytest
@@ -27,6 +27,17 @@ MNIST_DATA_DIR = "../data/dataset/testMnistData"
DATA_DIR = "../data/dataset/testPK/data"
def data_pipeline_same(file1, file2):
"""Return True when the two JSON files describe the same dataset pipeline, unwrapping the AT config format if present."""
assert file1.exists()
assert file2.exists()
with file1.open() as f1, file2.open() as f2:
pipeline1 = json.load(f1)
pipeline1 = pipeline1["pipeline"] if "pipeline" in pipeline1 else pipeline1
pipeline2 = json.load(f2)
pipeline2 = pipeline2["pipeline"] if "pipeline" in pipeline2 else pipeline2
return pipeline1 == pipeline2
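The helper deliberately compares only the "pipeline" subtree, so a new-format AT config file (which adds "summary" and "remark" keys) still compares equal to a plain serialized pipeline file. This is also why the filecmp.cmp checks below are replaced: a byte-for-byte comparison can no longer succeed between the two formats.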
@pytest.mark.forked
class TestAutotuneSaveLoad:
# Note: Use pytest fixture tmp_path to create files within this temporary directory,
@@ -40,14 +51,14 @@ class TestAutotuneSaveLoad:
Expectation: pipeline runs successfully
"""
original_autotune = ds.config.get_enable_autotune()
ds.config.set_enable_autotune(True, str(tmp_path) + "test_autotune_generator_atfinal.json")
ds.config.set_enable_autotune(True, str(tmp_path / "test_autotune_generator_atfinal.json"))
source = [(np.array([x]),) for x in range(1024)]
data1 = ds.GeneratorDataset(source, ["data"])
data1 = data1.shuffle(64)
data1 = data1.batch(32)
ds.serialize(data1, str(tmp_path) + "test_autotune_generator_serialized.json")
ds.serialize(data1, str(tmp_path / "test_autotune_generator_serialized.json"))
itr = data1.create_dict_iterator(num_epochs=5)
for _ in range(5):
@@ -64,7 +75,7 @@ class TestAutotuneSaveLoad:
Expectation: pipeline runs successfully
"""
original_autotune = ds.config.get_enable_autotune()
ds.config.set_enable_autotune(True, str(tmp_path) + "test_autotune_mnist_pipeline_atfinal.json")
ds.config.set_enable_autotune(True, str(tmp_path / "test_autotune_mnist_pipeline_atfinal.json"))
original_seed = ds.config.get_seed()
ds.config.set_seed(1)
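(Fixing the seed keeps the dataset sampling deterministic, so the record-by-record comparison of the two deserialized pipelines below is stable.)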
@@ -74,19 +85,20 @@ class TestAutotuneSaveLoad:
data1 = data1.batch(batch_size=10, drop_remainder=True)
ds.serialize(data1, str(tmp_path) + "test_autotune_mnist_pipeline_serialized.json")
ds.serialize(data1, str(tmp_path / "test_autotune_mnist_pipeline_serialized.json"))
for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
pass
ds.config.set_enable_autotune(original_autotune)
# Confirm final AutoTune config file is identical to the serialized file.
assert filecmp.cmp(str(tmp_path) + "test_autotune_mnist_pipeline_atfinal.json",
str(tmp_path) + "test_autotune_mnist_pipeline_serialized.json")
# Confirm final AutoTune config file pipeline is identical to the serialized file pipeline.
file1 = tmp_path / "test_autotune_mnist_pipeline_atfinal.json"
file2 = tmp_path / "test_autotune_mnist_pipeline_serialized.json"
assert data_pipeline_same(file1, file2)
desdata1 = ds.deserialize(json_filepath=str(tmp_path) + "test_autotune_mnist_pipeline_atfinal.json")
desdata2 = ds.deserialize(json_filepath=str(tmp_path) + "test_autotune_mnist_pipeline_serialized.json")
desdata1 = ds.deserialize(json_filepath=str(tmp_path / "test_autotune_mnist_pipeline_atfinal.json"))
desdata2 = ds.deserialize(json_filepath=str(tmp_path / "test_autotune_mnist_pipeline_serialized.json"))
num = 0
for newdata1, newdata2 in zip(desdata1.create_dict_iterator(num_epochs=1, output_numpy=True),
@@ -110,7 +122,7 @@ class TestAutotuneSaveLoad:
at_final_json_filename = "test_autotune_save_overwrite_generator_atfinal.json"
original_autotune = ds.config.get_enable_autotune()
ds.config.set_enable_autotune(True, str(tmp_path) + at_final_json_filename)
ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))
data1 = ds.GeneratorDataset(source, ["data"])
@@ -143,14 +155,14 @@ class TestAutotuneSaveLoad:
# Pipeline#1
original_autotune = ds.config.get_enable_autotune()
ds.config.set_enable_autotune(True, str(tmp_path) + at_final_json_filename)
ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))
data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=100)
one_hot_encode = c_transforms.OneHot(10) # num_classes is input argument
data1 = data1.map(operations=one_hot_encode, input_columns="label")
data1 = data1.batch(batch_size=10, drop_remainder=True)
ds.serialize(data1, str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized1.json")
ds.serialize(data1, str(tmp_path / "test_autotune_save_overwrite_mnist_serialized1.json"))
for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
pass
@@ -158,14 +170,14 @@ class TestAutotuneSaveLoad:
ds.config.set_enable_autotune(False)
# Pipeline#2
ds.config.set_enable_autotune(True, str(tmp_path) + at_final_json_filename)
ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))
data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=200)
data1 = data1.map(operations=one_hot_encode, input_columns="label")
data1 = data1.shuffle(40)
data1 = data1.batch(batch_size=20, drop_remainder=False)
ds.serialize(data1, str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized2.json")
ds.serialize(data1, str(tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"))
for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
pass
@@ -173,12 +185,14 @@ class TestAutotuneSaveLoad:
ds.config.set_enable_autotune(False)
# Confirm 2nd serialized file is identical to final AutoTune config file.
assert filecmp.cmp(str(tmp_path) + "test_autotune_save_overwrite_mnist_atfinal.json",
str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized2.json")
file1 = tmp_path / "test_autotune_save_overwrite_mnist_atfinal.json"
file2 = tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"
assert data_pipeline_same(file1, file2)
# Confirm the serialized files for the 2 different pipelines are different
assert not filecmp.cmp(str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized1.json",
str(tmp_path) + "test_autotune_save_overwrite_mnist_serialized2.json")
file1 = tmp_path / "test_autotune_save_overwrite_mnist_serialized1.json"
file2 = tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"
assert not data_pipeline_same(file1, file2)
ds.config.set_seed(original_seed)
ds.config.set_enable_autotune(original_autotune)