diff --git a/mindspore/ccsrc/minddata/dataset/api/samplers.cc b/mindspore/ccsrc/minddata/dataset/api/samplers.cc index 0a39dc6cb71..32190260753 100644 --- a/mindspore/ccsrc/minddata/dataset/api/samplers.cc +++ b/mindspore/ccsrc/minddata/dataset/api/samplers.cc @@ -29,6 +29,14 @@ namespace mindspore { namespace dataset { +Status Sampler::BuildChildren(std::shared_ptr *const sampler) const { + for (const auto &child : children_) { + std::shared_ptr sampler_obj = child->Parse(); + RETURN_IF_NOT_OK((*sampler)->AddChildSampler(sampler_obj)); + } + return Status::OK(); +} + // DistributedSampler DistributedSampler::DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle, int64_t num_samples, uint32_t seed, int64_t offset, bool even_dist) @@ -41,8 +49,13 @@ DistributedSampler::DistributedSampler(int64_t num_shards, int64_t shard_id, boo even_dist_(even_dist) {} std::shared_ptr DistributedSampler::Parse() const { - return std::make_shared(num_shards_, shard_id_, shuffle_, num_samples_, seed_, offset_, - even_dist_); + std::shared_ptr output = + std::make_shared(num_shards_, shard_id_, shuffle_, num_samples_, seed_, offset_, even_dist_); + Status s = BuildChildren(&output); + if (s.IsError()) { + MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s; + } + return output; } // PKSampler @@ -50,7 +63,12 @@ PKSampler::PKSampler(int64_t num_val, bool shuffle, int64_t num_samples) : num_val_(num_val), shuffle_(shuffle), num_samples_(num_samples) {} std::shared_ptr PKSampler::Parse() const { - return std::make_shared(num_val_, shuffle_, num_samples_); + std::shared_ptr output = std::make_shared(num_val_, shuffle_, num_samples_); + Status s = BuildChildren(&output); + if (s.IsError()) { + MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. 
Message: " << s; + } + return output; } // RandomSampler @@ -58,7 +76,12 @@ RandomSampler::RandomSampler(bool replacement, int64_t num_samples) : replacement_(replacement), num_samples_(num_samples) {} std::shared_ptr RandomSampler::Parse() const { - return std::make_shared(replacement_, num_samples_); + std::shared_ptr output = std::make_shared(replacement_, num_samples_); + Status s = BuildChildren(&output); + if (s.IsError()) { + MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s; + } + return output; } // SequentialSampler @@ -66,7 +89,12 @@ SequentialSampler::SequentialSampler(int64_t start_index, int64_t num_samples) : start_index_(start_index), num_samples_(num_samples) {} std::shared_ptr SequentialSampler::Parse() const { - return std::make_shared(start_index_, num_samples_); + std::shared_ptr output = std::make_shared(start_index_, num_samples_); + Status s = BuildChildren(&output); + if (s.IsError()) { + MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s; + } + return output; } // SubsetSampler @@ -74,7 +102,12 @@ SubsetSampler::SubsetSampler(const std::vector &indices, int64_t num_sa : indices_(indices), num_samples_(num_samples) {} std::shared_ptr SubsetSampler::Parse() const { - return std::make_shared(indices_, num_samples_); + std::shared_ptr output = std::make_shared(indices_, num_samples_); + Status s = BuildChildren(&output); + if (s.IsError()) { + MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s; + } + return output; } // SubsetRandomSampler @@ -82,7 +115,12 @@ SubsetRandomSampler::SubsetRandomSampler(const std::vector &indices, in : SubsetSampler(indices, num_samples) {} std::shared_ptr SubsetRandomSampler::Parse() const { - return std::make_shared(indices_, num_samples_); + std::shared_ptr output = std::make_shared(indices_, num_samples_); + Status s = BuildChildren(&output); + if (s.IsError()) { + MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. 
Message: " << s; + } + return output; } // WeightedRandomSampler @@ -90,7 +128,12 @@ WeightedRandomSampler::WeightedRandomSampler(const std::vector &weights, : weights_(weights), num_samples_(num_samples), replacement_(replacement) {} std::shared_ptr WeightedRandomSampler::Parse() const { - return std::make_shared(weights_, num_samples_, replacement_); + std::shared_ptr output = std::make_shared(weights_, num_samples_, replacement_); + Status s = BuildChildren(&output); + if (s.IsError()) { + MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s; + } + return output; } } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h b/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h index 9da88422d1f..30cb79ad8a5 100644 --- a/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h +++ b/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h @@ -21,6 +21,7 @@ #include #include "include/api/types.h" +#include "include/api/status.h" namespace mindspore { namespace dataset { @@ -88,6 +89,11 @@ class MS_API Sampler : std::enable_shared_from_this { /// \return shared pointer to the newly created TensorOperation. virtual std::shared_ptr Parse() const = 0; + /// \brief A function that calls Parse() on the children of this sampler + /// \param[in] sampler The samplerIR object built from this sampler + /// \return the Status code returned + Status BuildChildren(std::shared_ptr *const sampler) const; + std::vector> children_; }; diff --git a/mindspore/python/mindspore/dataset/engine/samplers.py b/mindspore/python/mindspore/dataset/engine/samplers.py index acb3dd886e0..e3938b61484 100644 --- a/mindspore/python/mindspore/dataset/engine/samplers.py +++ b/mindspore/python/mindspore/dataset/engine/samplers.py @@ -105,8 +105,8 @@ class BuiltinSampler: def add_child(self, sampler): """ - Add a sub-sampler for given sampler. 
The sub-sampler will receive all data from the - output of parent sampler and apply its sample logic to return new samples. + Add a sub-sampler for given sampler. The parent will receive all data from the + output of sub-sampler and apply its sample logic to return new samples. Args: sampler (Sampler): Object used to choose samples from the dataset. Only builtin @@ -115,9 +115,11 @@ class BuiltinSampler: Examples: >>> sampler = ds.SequentialSampler(start_index=0, num_samples=3) - >>> sampler.add_child(ds.RandomSampler(num_samples=2)) + >>> sampler.add_child(ds.RandomSampler(num_samples=4)) >>> dataset = ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler) """ + if self.child_sampler is not None: + raise RuntimeError("Cannot add child sampler, this sampler already has a child.") self.child_sampler = sampler def get_child(self): diff --git a/tests/ut/cpp/dataset/c_api_samplers_test.cc b/tests/ut/cpp/dataset/c_api_samplers_test.cc index f6dbe265fdc..893440016dd 100644 --- a/tests/ut/cpp/dataset/c_api_samplers_test.cc +++ b/tests/ut/cpp/dataset/c_api_samplers_test.cc @@ -93,7 +93,7 @@ TEST_F(MindDataTestPipeline, TestImageFolderWithSamplers) { // iterate through dataset and count rows // Expectation: There should be 12 rows TEST_F(MindDataTestPipeline, TestWeightedRandomSamplerImageFolder) { - std::vector weights = {0.9, 0.8, 0.68, 0.7, 0.71, 0.6, 0.5, 0.4, 0.3, 0.5, 0.2, 0.1}; + std::vector weights = {0.9, 0.8, 0.68, 0.7, 0.71, 0.6, 0.5, 0.4, 0.3, 0.5, 0.2, 0.1}; std::shared_ptr sampl = std::make_shared(weights, 12); EXPECT_NE(sampl, nullptr); @@ -272,7 +272,7 @@ TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess4) { } // Feature: Test ImageFolder with DistributedSampler -// Description: Create ImageFolder dataset with DistributedSampler given num_shards=11 and shard_id=10, +// Description: Create ImageFolder dataset with DistributedSampler given num_shards=11 and shard_id=10, // count rows in dataset // Expectation: There should be 4 rows (44 rows 
in original data/11 = 4) TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess5) { @@ -306,7 +306,7 @@ TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess5) { } // Feature: Test ImageFolder with DistributedSampler -// Description: Create ImageFolder dataset with DistributedSampler given num_shards=4 and shard_id=3, +// Description: Create ImageFolder dataset with DistributedSampler given num_shards=4 and shard_id=3, // count rows in dataset // Expectation: There should be 11 rows (44 rows in original data/4 = 11) TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess6) { @@ -431,6 +431,168 @@ TEST_F(MindDataTestPipeline, TestSamplerAddChild) { iter->Stop(); } +/// Feature: MindData Sampler Support +/// Description: Test MindData Sampler AddChild with nested children +/// Expectation: Result dataset has expected number of samples. +TEST_F(MindDataTestPipeline, TestSamplerAddChild2) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild2."; + + // num_samples of parent sampler > num_samples of child sampler, namely 5 > 2, num_shards is 2 to output dataset with + // 1 sample + auto sampler = std::make_shared(2, 0, false, 5, 0, -1, true); + EXPECT_NE(sampler, nullptr); + + // num_samples of parent sampler > num_samples of child sampler, namely 4 > 2 + auto child_sampler = std::make_shared(true, 4); + EXPECT_NE(child_sampler, nullptr); + auto child_sampler2 = std::make_shared(0, 2); + EXPECT_NE(child_sampler2, nullptr); + + child_sampler->AddChild(child_sampler2); + sampler->AddChild(child_sampler); + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, false, sampler); + EXPECT_NE(ds, nullptr); + + // Iterate the dataset and get each row + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + std::unordered_map row; + ASSERT_OK(iter->GetNextRow(&row)); + + uint64_t i = 0; + while (row.size() != 0) { + i++; + 
ASSERT_OK(iter->GetNextRow(&row)); + } + EXPECT_EQ(i, 1); + + EXPECT_EQ(ds->GetDatasetSize(), 1); + iter->Stop(); +} + +/// Feature: MindData Sampler Support +/// Description: Test MindData Sampler AddChild with num_samples of parent sampler > num_samples of child sampler +/// Expectation: Result dataset has expected number of samples. +TEST_F(MindDataTestPipeline, TestSamplerAddChild3) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild3."; + + // num_samples of parent sampler > num_samples of child sampler, namely 5 > 4 + std::vector weights = {1.0, 0.1, 0.02, 0.3}; + auto sampler = std::make_shared(weights, 5); + EXPECT_NE(sampler, nullptr); + + auto child_sampler = std::make_shared(0, 4); + EXPECT_NE(child_sampler, nullptr); + + sampler->AddChild(child_sampler); + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, false, sampler); + EXPECT_NE(ds, nullptr); + + // Iterate the dataset and get each row + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + std::unordered_map row; + ASSERT_OK(iter->GetNextRow(&row)); + + uint64_t i = 0; + while (row.size() != 0) { + i++; + ASSERT_OK(iter->GetNextRow(&row)); + } + EXPECT_EQ(i, 4); + + EXPECT_EQ(ds->GetDatasetSize(), 4); + iter->Stop(); +} + +/// Feature: MindData Sampler Support +/// Description: Test MindData Sampler AddChild with num_samples of parent sampler < num_samples of child sampler +/// Expectation: Result dataset has expected number of samples. 
+TEST_F(MindDataTestPipeline, TestSamplerAddChild4) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild4."; + + // num_samples of parent sampler < num_samples of child sampler, namely 5 < 7 + auto sampler = std::make_shared(1, 0, false, 5, 0, -1, true); + EXPECT_NE(sampler, nullptr); + + auto child_sampler = std::make_shared(3, true, 7); + EXPECT_NE(child_sampler, nullptr); + + sampler->AddChild(child_sampler); + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, false, sampler); + EXPECT_NE(ds, nullptr); + + // Iterate the dataset and get each row + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + std::unordered_map row; + ASSERT_OK(iter->GetNextRow(&row)); + + uint64_t i = 0; + while (row.size() != 0) { + i++; + ASSERT_OK(iter->GetNextRow(&row)); + } + EXPECT_EQ(i, 5); + + EXPECT_EQ(ds->GetDatasetSize(), 5); + iter->Stop(); +} + +/// Feature: MindData Sampler Support +/// Description: Test MindData Sampler AddChild with several children +/// Expectation: Result dataset has expected number of samples, and output error messages for more than 1 child. 
+TEST_F(MindDataTestPipeline, TestSamplerAddChild5) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild5."; + + // Use all samples (num_samples=0) for parent DistributedSampler + auto sampler = std::make_shared(1, 0, false, 0, 0, -1, true); + EXPECT_NE(sampler, nullptr); + + auto child_sampler1 = std::make_shared(0, 10); + EXPECT_NE(child_sampler1, nullptr); + sampler->AddChild(child_sampler1); + + // Attempting to add more than one child sampler is expected to fail + auto child_sampler2 = std::make_shared(0, 6); + EXPECT_NE(child_sampler2, nullptr); + sampler->AddChild(child_sampler2); + + auto child_sampler3 = std::make_shared(0, 7); + EXPECT_NE(child_sampler3, nullptr); + sampler->AddChild(child_sampler3); + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, false, sampler); + EXPECT_NE(ds, nullptr); + + // Iterate the dataset and get each row + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + std::unordered_map row; + ASSERT_OK(iter->GetNextRow(&row)); + + uint64_t i = 0; + while (row.size() != 0) { + i++; + ASSERT_OK(iter->GetNextRow(&row)); + } + EXPECT_EQ(i, 10); + + EXPECT_EQ(ds->GetDatasetSize(), 10); + iter->Stop(); +} + TEST_F(MindDataTestPipeline, TestSubsetSamplerSuccess1) { MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSubsetSamplerSuccess1."; // Test basic setting of subset_sampler with default num_samples @@ -553,7 +715,6 @@ TEST_F(MindDataTestPipeline, TestSubsetSamplerFail) { TEST_F(MindDataTestPipeline, TestPKSamplerImageFolder) { MS_LOG(INFO) << "Doing MindDataTestPipeline-TestPKSamplerImageFolder."; - std::shared_ptr sampler = std::make_shared(3, false); EXPECT_NE(sampler, nullptr); diff --git a/tests/ut/python/dataset/test_sampler_chain.py b/tests/ut/python/dataset/test_sampler_chain.py index 93359c76d5d..2a59acd88d7 100644 --- a/tests/ut/python/dataset/test_sampler_chain.py +++ 
b/tests/ut/python/dataset/test_sampler_chain.py @@ -136,23 +136,22 @@ def test_numpyslices_sampler_chain_multi_add_child(): """ Feature: Chained Sampler Description: NumpySlicesDataset with sampler chain with multiple add_child() invocations - Expectation: Data verified to be correct. Only last add_child() invocation is effective. + Expectation: Data verified to be correct. A subsequent add_child() invocation on a sampler that + already has a child raises RuntimeError. """ logger.info("test_numpyslices_sampler_chain_multi_add_child") # Create NumpySlicesDataset with sampler chain # Call add_child() multiple times in succession - # Note: A subsequent add_child() invocation replaces the prior child sampler (if any). np_data = [1, 2, 3, 4, 5, 6, 7, 8] sampler = ds.SequentialSampler(start_index=1, num_samples=None) - # 1st add_child invocation - sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=1)) - # 2nd add_child invocation - sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=2)) - # 3rd add_child invocation - sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=3)) - # 4th and last add_child invocation which is the effective child sampler sampler.add_child(ds.SequentialSampler(start_index=1, num_samples=6)) + # Expect the second child will fail + with pytest.raises(RuntimeError) as info: + sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=2)) + + error_msg = "Cannot add child sampler, this sampler already has a child." 
+ assert error_msg in str(info.value) data1 = ds.NumpySlicesDataset(np_data, sampler=sampler) @@ -281,13 +280,14 @@ def test_coco_sampler_chain(): def test_cifar_sampler_chain(): """ - Test Cifar sampler chain + Test Cifar sampler chain, including nested child sampler """ logger.info("test_cifar_sampler_chain") sampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=5) child_sampler = ds.RandomSampler(replacement=True, num_samples=4) child_sampler2 = ds.SequentialSampler(start_index=0, num_samples=2) + # Note: Add nested child_sampler2 to child_sampler child_sampler.add_child(child_sampler2) sampler.add_child(child_sampler) data1 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, sampler=sampler)