mindspore/tests/ut/python/dataset/test_autotune_saveload.py

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Test Dataset AutoTune's Save and Load Configuration support
"""
import os
import json
import random
import numpy as np
import pytest
import mindspore.dataset as ds
import mindspore.dataset.transforms as transforms
import mindspore.dataset.vision as vision

MNIST_DATA_DIR = "../data/dataset/testMnistData"
DATA_DIR = "../data/dataset/testPK/data"


def data_pipeline_same(file1, file2):
    """
    Compare the pipelines stored in two JSON files.

    Both files must exist (checked with assertions). Each file is parsed as JSON; when the
    parsed content has a "tree" entry, only that entry is compared, otherwise the whole
    content is compared.

    Args:
        file1: Path-like object providing `exists()` and `open()` for the first JSON file.
        file2: Path-like object providing `exists()` and `open()` for the second JSON file.

    Returns:
        bool, True if both files hold equal pipelines, False otherwise.
    """
    # Check both files up front so a missing file is reported before anything is parsed.
    assert file1.exists()
    assert file2.exists()

    def _pipeline_tree(handle):
        # Parse one open JSON file and unwrap its "tree" entry when one is present.
        content = json.load(handle)
        if "tree" in content:
            return content["tree"]
        return content

    with file1.open() as first_handle, file2.open() as second_handle:
        first_tree = _pipeline_tree(first_handle)
        second_tree = _pipeline_tree(second_handle)
    return first_tree == second_tree


@pytest.mark.forked
class TestAutotuneSaveLoad:
    """
    Tests for saving the final Dataset AutoTune configuration and loading it back.

    RANK_ID is set in the environment before each test and removed afterwards; the tests
    look for the final AutoTune configuration at "<filepath prefix>_<RANK_ID>.json".
    Every test restores the global dataset configuration it modifies (AutoTune setting, seed)
    in a `finally` clause so that a failing test does not leave modified settings behind.
    """
    # Note: Use pytest fixture tmp_path to create files within this temporary directory,
    # which is automatically created for each test and deleted at the end of the test.

    @staticmethod
    def setup_method():
        """Set RANK_ID to a random single-digit value for the test about to run."""
        os.environ['RANK_ID'] = str(random.randint(0, 9))

    @staticmethod
    def teardown_method():
        """Remove RANK_ID from the environment, tolerating the case where it is already absent."""
        os.environ.pop('RANK_ID', None)

    @staticmethod
    def test_autotune_generator_pipeline(tmp_path):
        """
        Feature: Autotuning
        Description: Test save final config with GeneratorDataset pipeline: Generator -> Shuffle -> Batch
        Expectation: Pipeline runs successfully
        """
        original_autotune = ds.config.get_enable_autotune()
        try:
            ds.config.set_enable_autotune(True, str(tmp_path / "test_autotune_generator_atfinal"))

            source = [(np.array([x]),) for x in range(1024)]
            data1 = ds.GeneratorDataset(source, ["data"])
            data1 = data1.shuffle(64)
            data1 = data1.batch(32)

            ds.serialize(data1, str(tmp_path / "test_autotune_generator_serialized.json"))

            itr = data1.create_dict_iterator(num_epochs=5)
            for _ in range(5):
                for _ in itr:
                    pass
            # Drop the iterator explicitly before the config file check below.
            # NOTE(review): presumably this releases the pipeline so AutoTune writes its final config.
            del itr
        finally:
            ds.config.set_enable_autotune(original_autotune)

        config_file = tmp_path / ("test_autotune_generator_atfinal_" + os.environ['RANK_ID'] + ".json")
        assert config_file.exists()

    @staticmethod
    def test_autotune_save_overwrite_generator(tmp_path):
        """
        Feature: Autotuning
        Description: Test set_enable_autotune and existing json_filepath is overwritten
        Expectation: set_enable_autotune() executes successfully with file-exist warning produced.
            Execution of 2nd pipeline overwrites AutoTune configuration file of 1st pipeline.
        """
        source = [(np.array([x]),) for x in range(1024)]

        at_final_json_filename = "test_autotune_save_overwrite_generator_atfinal.json"
        # Build the path once with the path operator so both pipelines target the same file
        # inside tmp_path (plain string concatenation would drop the path separator).
        at_final_json_filepath = str(tmp_path / at_final_json_filename)
        original_autotune = ds.config.get_enable_autotune()
        try:
            # Pipeline#1
            ds.config.set_enable_autotune(True, at_final_json_filepath)

            data1 = ds.GeneratorDataset(source, ["data"])

            for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
                pass

            ds.config.set_enable_autotune(False)

            # Pipeline#2: re-enable AutoTune with the same filepath as Pipeline#1.
            ds.config.set_enable_autotune(True, at_final_json_filepath)

            data2 = ds.GeneratorDataset(source, ["data"])
            data2 = data2.shuffle(64)

            for _ in data2.create_dict_iterator(num_epochs=1, output_numpy=True):
                pass
        finally:
            ds.config.set_enable_autotune(original_autotune)

    @staticmethod
    def test_autotune_mnist_pipeline(tmp_path):
        """
        Feature: Autotuning
        Description: Test save final config with Mnist pipeline: Mnist -> Batch -> Map
        Expectation: Pipeline runs successfully
        """
        original_autotune = ds.config.get_enable_autotune()
        original_seed = ds.config.get_seed()
        try:
            ds.config.set_enable_autotune(True, str(tmp_path / "test_autotune_mnist_pipeline_atfinal"))
            ds.config.set_seed(1)

            data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=100)
            one_hot_encode = transforms.OneHot(10)  # num_classes is input argument
            data1 = data1.map(operations=one_hot_encode, input_columns="label")

            data1 = data1.batch(batch_size=10, drop_remainder=True)

            ds.serialize(data1, str(tmp_path / "test_autotune_mnist_pipeline_serialized.json"))

            for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
                pass

            # Restore the AutoTune setting before the deserialized pipelines below are run.
            ds.config.set_enable_autotune(original_autotune)

            # Confirm final AutoTune config file pipeline is identical to the serialized file pipeline.
            file1 = tmp_path / ("test_autotune_mnist_pipeline_atfinal_" + os.environ['RANK_ID'] + ".json")
            file2 = tmp_path / "test_autotune_mnist_pipeline_serialized.json"
            assert data_pipeline_same(file1, file2)

            desdata1 = ds.deserialize(json_filepath=str(file1))
            desdata2 = ds.deserialize(json_filepath=str(file2))

            # Both deserialized pipelines must produce the same 10 batches.
            num = 0
            for newdata1, newdata2 in zip(desdata1.create_dict_iterator(num_epochs=1, output_numpy=True),
                                          desdata2.create_dict_iterator(num_epochs=1, output_numpy=True)):
                np.testing.assert_array_equal(newdata1['image'], newdata2['image'])
                np.testing.assert_array_equal(newdata1['label'], newdata2['label'])
                num += 1
            assert num == 10
        finally:
            ds.config.set_enable_autotune(original_autotune)
            ds.config.set_seed(original_seed)

    @staticmethod
    def test_autotune_warning_with_offload(tmp_path, capfd):
        """
        Feature: Autotuning
        Description: Test autotune config saving with offload=True
        Expectation: Autotune should not write the config file and print a log message
        """
        original_seed = ds.config.get_seed()
        original_autotune = ds.config.get_enable_autotune()
        at_final_json_filename = "test_autotune_warning_with_offload_config.json"
        config_path = tmp_path / at_final_json_filename
        try:
            ds.config.set_seed(1)
            ds.config.set_enable_autotune(True, str(config_path))

            # Dataset with offload activated.
            dataset = ds.ImageFolderDataset(DATA_DIR, num_samples=8)
            dataset = dataset.map(operations=[vision.Decode()], input_columns="image")
            dataset = dataset.map(operations=[vision.HWC2CHW()], input_columns="image", offload=True)
            dataset = dataset.batch(8, drop_remainder=True)

            for _ in dataset.create_tuple_iterator(num_epochs=1, output_numpy=True):
                pass

            # The log message is expected on the captured stderr stream.
            _, err = capfd.readouterr()

            assert "Some nodes have been offloaded. AutoTune is unable to write the autotune configuration to disk. " \
                   "Disable offload to prevent this from happening." in err

            # NOTE(review): the other tests in this class expect AutoTune output at
            # "<prefix>_<RANK_ID>.json"; confirm whether this check should cover that name as well.
            with pytest.raises(FileNotFoundError):
                with open(config_path) as _:
                    pass
        finally:
            ds.config.set_enable_autotune(original_autotune)
            ds.config.set_seed(original_seed)

    @staticmethod
    def test_autotune_save_overwrite_mnist(tmp_path):
        """
        Feature: Autotuning
        Description: Test set_enable_autotune and existing json_filepath is overwritten
        Expectation: set_enable_autotune() executes successfully with file-exist warning produced.
            Execution of 2nd pipeline overwrites AutoTune configuration file of 1st pipeline.
        """
        original_seed = ds.config.get_seed()
        original_autotune = ds.config.get_enable_autotune()
        at_final_json_filename = "test_autotune_save_overwrite_mnist_atfinal"
        try:
            ds.config.set_seed(1)

            # Pipeline#1
            ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))

            data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=100)
            one_hot_encode = transforms.OneHot(10)  # num_classes is input argument
            data1 = data1.map(operations=one_hot_encode, input_columns="label")
            data1 = data1.batch(batch_size=10, drop_remainder=True)

            ds.serialize(data1, str(tmp_path / "test_autotune_save_overwrite_mnist_serialized1.json"))

            for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
                pass

            ds.config.set_enable_autotune(False)

            # Pipeline#2
            ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))

            data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=200)
            data1 = data1.map(operations=one_hot_encode, input_columns="label")
            data1 = data1.shuffle(40)
            data1 = data1.batch(batch_size=20, drop_remainder=False)

            ds.serialize(data1, str(tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"))

            for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
                pass

            ds.config.set_enable_autotune(False)

            # Confirm 2nd serialized file is identical to final AutoTune config file.
            file1 = tmp_path / (at_final_json_filename + "_" + os.environ['RANK_ID'] + ".json")
            file2 = tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"
            assert data_pipeline_same(file1, file2)

            # Confirm the serialized files for the 2 different pipelines are different
            file1 = tmp_path / "test_autotune_save_overwrite_mnist_serialized1.json"
            file2 = tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"
            assert not data_pipeline_same(file1, file2)
        finally:
            ds.config.set_seed(original_seed)
            ds.config.set_enable_autotune(original_autotune)