# Source: mindspore/tests/ut/python/dataset/test_autotune_saveload.py
# (file viewer metadata: 246 lines, 9.8 KiB, Python)

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Test Dataset AutoTune's Save and Load Configuration support
"""
import os
import json
import random
import numpy as np
import pytest
import mindspore.dataset as ds
import mindspore.dataset.transforms as transforms
import mindspore.dataset.vision as vision
# Directory handed to ds.MnistDataset by the MNIST-based tests in this file.
# NOTE: both paths are relative, so they depend on the working directory the tests run from.
MNIST_DATA_DIR = "../data/dataset/testMnistData"
# Directory handed to ds.ImageFolderDataset by the offload test in this file.
DATA_DIR = "../data/dataset/testPK/data"
def data_pipeline_same(file1, file2):
    """
    Check whether two JSON files describe the same dataset pipeline.

    Each file is parsed as JSON. If the parsed document is an object with a
    top-level "tree" key, only the value stored under "tree" is compared;
    otherwise the whole document is compared. This lets a file that wraps the
    pipeline under "tree" be compared against one that holds the bare pipeline.

    Args:
        file1 (pathlib.Path): First JSON file (must provide exists() and open()).
        file2 (pathlib.Path): Second JSON file (must provide exists() and open()).

    Returns:
        bool, True if both pipelines compare equal, False otherwise.

    Raises:
        AssertionError: If either file does not exist.
    """

    def load_pipeline(json_path):
        # JSON is read as UTF-8 explicitly so the result does not depend on the locale.
        with json_path.open(encoding="utf-8") as json_file:
            document = json.load(json_file)
        # Only unwrap "tree" for JSON objects; a list or string that merely contains
        # "tree" must not be indexed with a string key.
        if isinstance(document, dict) and "tree" in document:
            return document["tree"]
        return document

    # Both files are checked up front so a missing file is always reported as an
    # assertion failure, before any parsing happens.
    assert file1.exists()
    assert file2.exists()
    return load_pipeline(file1) == load_pipeline(file2)
@pytest.mark.forked
class TestAutotuneSaveLoad:
    """
    Tests for saving (and reloading) the final AutoTune pipeline configuration.

    Every test enables AutoTune with an output path prefix inside pytest's tmp_path,
    runs a dataset pipeline, and restores the global dataset configuration afterwards.
    The tests that check the saved file expect it to be named
    ``<prefix>_<RANK_ID>.json``.
    """
    # Note: Use pytest fixture tmp_path to create files within this temporary directory,
    # which is automatically created for each test and deleted at the end of the test.

    @staticmethod
    def setup_method():
        """Set RANK_ID to a random single digit; the tests use it to build the expected file names."""
        os.environ['RANK_ID'] = str(random.randint(0, 9))

    @staticmethod
    def teardown_method():
        """Remove the RANK_ID set by setup_method."""
        # pop() instead of del: a missing variable must not raise here and mask the real test failure.
        os.environ.pop('RANK_ID', None)

    @staticmethod
    def test_autotune_generator_pipeline(tmp_path):
        """
        Feature: Autotuning
        Description: Test save final config with GeneratorDataset pipeline: Generator -> Shuffle -> Batch
        Expectation: Pipeline runs successfully
        """
        original_autotune = ds.config.get_enable_autotune()
        ds.config.set_enable_autotune(True, str(tmp_path / "test_autotune_generator_atfinal"))
        try:
            source = [(np.array([x]),) for x in range(1024)]
            data1 = ds.GeneratorDataset(source, ["data"])
            data1 = data1.shuffle(64)
            data1 = data1.batch(32)
            ds.serialize(data1, str(tmp_path / "test_autotune_generator_serialized.json"))

            itr = data1.create_dict_iterator(num_epochs=5)
            for _ in range(5):
                for _ in itr:
                    pass
            # The iterator is released explicitly before the output file is checked.
            del itr
        finally:
            # Restore the global setting even if the pipeline fails.
            ds.config.set_enable_autotune(original_autotune)

        file = tmp_path / ("test_autotune_generator_atfinal_" + os.environ['RANK_ID'] + ".json")
        assert file.exists()

    @staticmethod
    def test_autotune_save_overwrite_generator(tmp_path):
        """
        Feature: Autotuning
        Description: Test set_enable_autotune and existing json_filepath is overwritten
        Expectation: set_enable_autotune() executes successfully with file-exist warning produced.
            Execution of 2nd pipeline overwrites AutoTune configuration file of 1st pipeline.
        """
        source = [(np.array([x]),) for x in range(1024)]

        at_final_json_filename = "test_autotune_save_overwrite_generator_atfinal.json"
        # Both pipelines must target the very same path, otherwise nothing is overwritten.
        # The path is built with the "/" operator so it stays inside tmp_path.
        at_final_json_path = str(tmp_path / at_final_json_filename)

        original_autotune = ds.config.get_enable_autotune()
        try:
            # Pipeline#1
            ds.config.set_enable_autotune(True, at_final_json_path)
            data1 = ds.GeneratorDataset(source, ["data"])
            for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
                pass
            ds.config.set_enable_autotune(False)

            # Pipeline#2: same output path as pipeline#1
            ds.config.set_enable_autotune(True, at_final_json_path)
            data2 = ds.GeneratorDataset(source, ["data"])
            data2 = data2.shuffle(64)
            for _ in data2.create_dict_iterator(num_epochs=1, output_numpy=True):
                pass
        finally:
            # Restore the global setting even if a pipeline fails.
            ds.config.set_enable_autotune(original_autotune)

    @staticmethod
    def test_autotune_mnist_pipeline(tmp_path):
        """
        Feature: Autotuning
        Description: Test save final config with Mnist pipeline: Mnist -> Batch -> Map
        Expectation: Pipeline runs successfully
        """
        original_autotune = ds.config.get_enable_autotune()
        original_seed = ds.config.get_seed()
        try:
            ds.config.set_enable_autotune(True, str(tmp_path / "test_autotune_mnist_pipeline_atfinal"))
            try:
                ds.config.set_seed(1)

                data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=100)
                one_hot_encode = transforms.OneHot(10)  # num_classes is input argument
                data1 = data1.map(operations=one_hot_encode, input_columns="label")
                data1 = data1.batch(batch_size=10, drop_remainder=True)
                ds.serialize(data1, str(tmp_path / "test_autotune_mnist_pipeline_serialized.json"))

                for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
                    pass
            finally:
                # AutoTune is restored before the saved files are compared and reloaded.
                ds.config.set_enable_autotune(original_autotune)

            # Confirm final AutoTune config file pipeline is identical to the serialized file pipeline.
            file1 = tmp_path / ("test_autotune_mnist_pipeline_atfinal_" + os.environ['RANK_ID'] + ".json")
            file2 = tmp_path / "test_autotune_mnist_pipeline_serialized.json"
            assert data_pipeline_same(file1, file2)

            # Both deserialized pipelines run under the same fixed seed and must yield identical data.
            desdata1 = ds.deserialize(json_filepath=str(file1))
            desdata2 = ds.deserialize(json_filepath=str(file2))

            num = 0
            for newdata1, newdata2 in zip(desdata1.create_dict_iterator(num_epochs=1, output_numpy=True),
                                          desdata2.create_dict_iterator(num_epochs=1, output_numpy=True)):
                np.testing.assert_array_equal(newdata1['image'], newdata2['image'])
                np.testing.assert_array_equal(newdata1['label'], newdata2['label'])
                num += 1
            # 100 samples in batches of 10 -> 10 batches.
            assert num == 10
        finally:
            ds.config.set_seed(original_seed)

    @staticmethod
    def test_autotune_warning_with_offload(tmp_path, capfd):
        """
        Feature: Autotuning
        Description: Test autotune config saving with offload=True
        Expectation: Autotune should not write the config file and print a log message
        """
        original_seed = ds.config.get_seed()
        original_autotune = ds.config.get_enable_autotune()

        at_final_json_filename = "test_autotune_warning_with_offload_config.json"
        config_path = tmp_path / at_final_json_filename

        try:
            ds.config.set_seed(1)
            ds.config.set_enable_autotune(True, str(config_path))

            # Dataset with offload activated.
            dataset = ds.ImageFolderDataset(DATA_DIR, num_samples=8)
            dataset = dataset.map(operations=[vision.Decode()], input_columns="image")
            dataset = dataset.map(operations=[vision.HWC2CHW()], input_columns="image", offload=True)
            dataset = dataset.batch(8, drop_remainder=True)

            for _ in dataset.create_tuple_iterator(num_epochs=1, output_numpy=True):
                pass

            # The message is looked up in the captured stderr stream.
            _, err = capfd.readouterr()
            assert "Some nodes have been offloaded. AutoTune is unable to write the autotune configuration to disk. " \
                   "Disable offload to prevent this from happening." in err

            # NOTE(review): the other tests in this class expect AutoTune output at
            # "<prefix>_<RANK_ID>.json", so this exact path may never be written even
            # without offload. Confirm this check targets the right file name.
            with pytest.raises(FileNotFoundError):
                with open(config_path) as _:
                    pass
        finally:
            # Restore global settings even if an assertion above fails.
            ds.config.set_enable_autotune(original_autotune)
            ds.config.set_seed(original_seed)

    @staticmethod
    def test_autotune_save_overwrite_mnist(tmp_path):
        """
        Feature: Autotuning
        Description: Test set_enable_autotune and existing json_filepath is overwritten
        Expectation: set_enable_autotune() executes successfully with file-exist warning produced.
            Execution of 2nd pipeline overwrites AutoTune configuration file of 1st pipeline.
        """
        original_seed = ds.config.get_seed()
        original_autotune = ds.config.get_enable_autotune()
        at_final_json_filename = "test_autotune_save_overwrite_mnist_atfinal"

        try:
            ds.config.set_seed(1)

            # Pipeline#1
            ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))

            data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=100)
            one_hot_encode = transforms.OneHot(10)  # num_classes is input argument
            data1 = data1.map(operations=one_hot_encode, input_columns="label")
            data1 = data1.batch(batch_size=10, drop_remainder=True)
            ds.serialize(data1, str(tmp_path / "test_autotune_save_overwrite_mnist_serialized1.json"))

            for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
                pass

            ds.config.set_enable_autotune(False)

            # Pipeline#2
            ds.config.set_enable_autotune(True, str(tmp_path / at_final_json_filename))

            data1 = ds.MnistDataset(MNIST_DATA_DIR, num_samples=200)
            data1 = data1.map(operations=one_hot_encode, input_columns="label")
            data1 = data1.shuffle(40)
            data1 = data1.batch(batch_size=20, drop_remainder=False)
            ds.serialize(data1, str(tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"))

            for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
                pass

            ds.config.set_enable_autotune(False)

            # Confirm 2nd serialized file is identical to final AutoTune config file.
            file1 = tmp_path / ("test_autotune_save_overwrite_mnist_atfinal_" + os.environ['RANK_ID'] + ".json")
            file2 = tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"
            assert data_pipeline_same(file1, file2)

            # Confirm the serialized files for the 2 different pipelines are different
            file1 = tmp_path / "test_autotune_save_overwrite_mnist_serialized1.json"
            file2 = tmp_path / "test_autotune_save_overwrite_mnist_serialized2.json"
            assert not data_pipeline_same(file1, file2)
        finally:
            # Restore global settings even if a pipeline or assertion above fails.
            ds.config.set_seed(original_seed)
            ds.config.set_enable_autotune(original_autotune)