!29358 MD Child Sampler Code and UT Issues

Merge pull request !29358 from zetongzhao/sampler_AddChild
This commit is contained in:
i-robot 2022-01-31 16:55:22 +00:00 committed by Gitee
commit e12d6af257
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
5 changed files with 237 additions and 25 deletions

View File

@ -29,6 +29,14 @@
namespace mindspore {
namespace dataset {
// Parse each child of this sampler and attach the resulting IR object to *sampler.
// Any non-OK status from AddChildSampler is handed to RETURN_IF_NOT_OK; Status::OK() is
// returned once every child has been processed.
Status Sampler::BuildChildren(std::shared_ptr<SamplerObj> *const sampler) const {
  for (auto it = children_.cbegin(); it != children_.cend(); ++it) {
    std::shared_ptr<SamplerObj> child_ir = (*it)->Parse();
    RETURN_IF_NOT_OK((*sampler)->AddChildSampler(child_ir));
  }
  return Status::OK();
}
// DistributedSampler
DistributedSampler::DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle, int64_t num_samples,
uint32_t seed, int64_t offset, bool even_dist)
@ -41,8 +49,13 @@ DistributedSampler::DistributedSampler(int64_t num_shards, int64_t shard_id, boo
even_dist_(even_dist) {}
// Build the IR object for this DistributedSampler and attach its parsed children to it.
// Parse() has no Status return, so a failure while attaching children is logged and the
// IR object is still returned.
std::shared_ptr<SamplerObj> DistributedSampler::Parse() const {
  std::shared_ptr<SamplerObj> output =
    std::make_shared<DistributedSamplerObj>(num_shards_, shard_id_, shuffle_, num_samples_, seed_, offset_, even_dist_);
  // No early return before this point: the children must be attached before handing out the object.
  Status s = BuildChildren(&output);
  if (s.IsError()) {
    MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
  }
  return output;
}
// PKSampler
@ -50,7 +63,12 @@ PKSampler::PKSampler(int64_t num_val, bool shuffle, int64_t num_samples)
: num_val_(num_val), shuffle_(shuffle), num_samples_(num_samples) {}
// Build the IR object for this PKSampler and attach its parsed children to it.
// Parse() has no Status return, so a failure while attaching children is logged and the
// IR object is still returned.
std::shared_ptr<SamplerObj> PKSampler::Parse() const {
  std::shared_ptr<SamplerObj> output = std::make_shared<PKSamplerObj>(num_val_, shuffle_, num_samples_);
  // No early return before this point: the children must be attached before handing out the object.
  Status s = BuildChildren(&output);
  if (s.IsError()) {
    MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
  }
  return output;
}
// RandomSampler
@ -58,7 +76,12 @@ RandomSampler::RandomSampler(bool replacement, int64_t num_samples)
: replacement_(replacement), num_samples_(num_samples) {}
// Build the IR object for this RandomSampler and attach its parsed children to it.
// Parse() has no Status return, so a failure while attaching children is logged and the
// IR object is still returned.
std::shared_ptr<SamplerObj> RandomSampler::Parse() const {
  std::shared_ptr<SamplerObj> output = std::make_shared<RandomSamplerObj>(replacement_, num_samples_);
  // No early return before this point: the children must be attached before handing out the object.
  Status s = BuildChildren(&output);
  if (s.IsError()) {
    MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
  }
  return output;
}
// SequentialSampler
@ -66,7 +89,12 @@ SequentialSampler::SequentialSampler(int64_t start_index, int64_t num_samples)
: start_index_(start_index), num_samples_(num_samples) {}
// Build the IR object for this SequentialSampler and attach its parsed children to it.
// Parse() has no Status return, so a failure while attaching children is logged and the
// IR object is still returned.
std::shared_ptr<SamplerObj> SequentialSampler::Parse() const {
  std::shared_ptr<SamplerObj> output = std::make_shared<SequentialSamplerObj>(start_index_, num_samples_);
  // No early return before this point: the children must be attached before handing out the object.
  Status s = BuildChildren(&output);
  if (s.IsError()) {
    MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
  }
  return output;
}
// SubsetSampler
@ -74,7 +102,12 @@ SubsetSampler::SubsetSampler(const std::vector<int64_t> &indices, int64_t num_sa
: indices_(indices), num_samples_(num_samples) {}
// Build the IR object for this SubsetSampler and attach its parsed children to it.
// Parse() has no Status return, so a failure while attaching children is logged and the
// IR object is still returned.
std::shared_ptr<SamplerObj> SubsetSampler::Parse() const {
  std::shared_ptr<SamplerObj> output = std::make_shared<SubsetSamplerObj>(indices_, num_samples_);
  // No early return before this point: the children must be attached before handing out the object.
  Status s = BuildChildren(&output);
  if (s.IsError()) {
    MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
  }
  return output;
}
// SubsetRandomSampler
@ -82,7 +115,12 @@ SubsetRandomSampler::SubsetRandomSampler(const std::vector<int64_t> &indices, in
: SubsetSampler(indices, num_samples) {}
// Build the IR object for this SubsetRandomSampler and attach its parsed children to it.
// Parse() has no Status return, so a failure while attaching children is logged and the
// IR object is still returned.
std::shared_ptr<SamplerObj> SubsetRandomSampler::Parse() const {
  std::shared_ptr<SamplerObj> output = std::make_shared<SubsetRandomSamplerObj>(indices_, num_samples_);
  // No early return before this point: the children must be attached before handing out the object.
  Status s = BuildChildren(&output);
  if (s.IsError()) {
    MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
  }
  return output;
}
// WeightedRandomSampler
@ -90,7 +128,12 @@ WeightedRandomSampler::WeightedRandomSampler(const std::vector<double> &weights,
: weights_(weights), num_samples_(num_samples), replacement_(replacement) {}
// Build the IR object for this WeightedRandomSampler and attach its parsed children to it.
// Parse() has no Status return, so a failure while attaching children is logged and the
// IR object is still returned.
std::shared_ptr<SamplerObj> WeightedRandomSampler::Parse() const {
  std::shared_ptr<SamplerObj> output = std::make_shared<WeightedRandomSamplerObj>(weights_, num_samples_, replacement_);
  // No early return before this point: the children must be attached before handing out the object.
  Status s = BuildChildren(&output);
  if (s.IsError()) {
    MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
  }
  return output;
}
} // namespace dataset
} // namespace mindspore

View File

@ -21,6 +21,7 @@
#include <vector>
#include "include/api/types.h"
#include "include/api/status.h"
namespace mindspore {
namespace dataset {
@ -88,6 +89,11 @@ class MS_API Sampler : std::enable_shared_from_this<Sampler> {
/// \return shared pointer to the newly created SamplerObj.
virtual std::shared_ptr<SamplerObj> Parse() const = 0;
/// \brief Call Parse() on every child of this sampler and add each parsed child to the given sampler IR object.
/// \param[in] sampler Pointer to the SamplerObj built from this sampler; each parsed child is attached to it
///     through AddChildSampler.
/// \return Status::OK() once all children have been processed; a non-OK status from AddChildSampler is passed
///     to RETURN_IF_NOT_OK.
Status BuildChildren(std::shared_ptr<SamplerObj> *const sampler) const;
std::vector<std::shared_ptr<Sampler>> children_;
};

View File

@ -105,8 +105,8 @@ class BuiltinSampler:
def add_child(self, sampler):
"""
Add a sub-sampler for given sampler. The sub-sampler will receive all data from the
output of parent sampler and apply its sample logic to return new samples.
Add a sub-sampler for the given sampler. The parent will receive all data from the
output of the sub-sampler and apply its sample logic to return new samples.
Args:
sampler (Sampler): Object used to choose samples from the dataset. Only builtin
@ -115,9 +115,11 @@ class BuiltinSampler:
Examples:
>>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
>>> sampler.add_child(ds.RandomSampler(num_samples=2))
>>> sampler.add_child(ds.RandomSampler(num_samples=4))
>>> dataset = ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler)
"""
if self.child_sampler is not None:
raise RuntimeError("Cannot add child sampler, this sampler already has a child.")
self.child_sampler = sampler
def get_child(self):

View File

@ -93,7 +93,7 @@ TEST_F(MindDataTestPipeline, TestImageFolderWithSamplers) {
// iterate through dataset and count rows
// Expectation: There should be 12 rows
TEST_F(MindDataTestPipeline, TestWeightedRandomSamplerImageFolder) {
std::vector<double> weights = {0.9, 0.8, 0.68, 0.7, 0.71, 0.6, 0.5, 0.4, 0.3, 0.5, 0.2, 0.1};
std::vector<double> weights = {0.9, 0.8, 0.68, 0.7, 0.71, 0.6, 0.5, 0.4, 0.3, 0.5, 0.2, 0.1};
std::shared_ptr<Sampler> sampl = std::make_shared<WeightedRandomSampler>(weights, 12);
EXPECT_NE(sampl, nullptr);
@ -272,7 +272,7 @@ TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess4) {
}
// Feature: Test ImageFolder with DistributedSampler
// Description: Create ImageFolder dataset with DistributedSampler given num_shards=11 and shard_id=10,
// Description: Create ImageFolder dataset with DistributedSampler given num_shards=11 and shard_id=10,
// count rows in dataset
// Expectation: There should be 4 rows (44 rows in original data/11 = 4)
TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess5) {
@ -306,7 +306,7 @@ TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess5) {
}
// Feature: Test ImageFolder with DistributedSampler
// Description: Create ImageFolder dataset with DistributedSampler given num_shards=4 and shard_id=3,
// Description: Create ImageFolder dataset with DistributedSampler given num_shards=4 and shard_id=3,
// count rows in dataset
// Expectation: There should be 11 rows (44 rows in original data/4 = 11)
TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess6) {
@ -431,6 +431,168 @@ TEST_F(MindDataTestPipeline, TestSamplerAddChild) {
iter->Stop();
}
/// Feature: MindData Sampler Support
/// Description: Test MindData Sampler AddChild with nested children
/// Expectation: Result dataset has expected number of samples.
TEST_F(MindDataTestPipeline, TestSamplerAddChild2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild2.";

  // Parent sampler: num_samples=5 (larger than the child's 4) with num_shards=2; the resulting dataset is
  // expected to contain 1 sample.
  auto sampler = std::make_shared<DistributedSampler>(2, 0, false, 5, 0, -1, true);
  ASSERT_NE(sampler, nullptr);

  // Child sampler: num_samples=4, larger than the num_samples=2 of its own nested child.
  auto child_sampler = std::make_shared<RandomSampler>(true, 4);
  ASSERT_NE(child_sampler, nullptr);
  auto child_sampler2 = std::make_shared<SequentialSampler>(0, 2);
  ASSERT_NE(child_sampler2, nullptr);

  // Build the chain sampler -> child_sampler -> child_sampler2.
  child_sampler->AddChild(child_sampler2);
  sampler->AddChild(child_sampler);

  // Create an ImageFolder Dataset
  std::string folder_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> ds = ImageFolder(folder_path, false, sampler);
  // ASSERT (not EXPECT): ds is dereferenced right below.
  ASSERT_NE(ds, nullptr);

  // Iterate the dataset and get each row
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // ASSERT (not EXPECT): iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  EXPECT_EQ(i, 1);
  EXPECT_EQ(ds->GetDatasetSize(), 1);
  iter->Stop();
}
/// Feature: MindData Sampler Support
/// Description: Test MindData Sampler AddChild with num_samples of parent sampler > num_samples of child sampler
/// Expectation: Result dataset has expected number of samples.
TEST_F(MindDataTestPipeline, TestSamplerAddChild3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild3.";

  // num_samples of parent sampler > num_samples of child sampler, namely 5 > 4
  std::vector<double> weights = {1.0, 0.1, 0.02, 0.3};
  auto sampler = std::make_shared<WeightedRandomSampler>(weights, 5);
  ASSERT_NE(sampler, nullptr);

  auto child_sampler = std::make_shared<SequentialSampler>(0, 4);
  ASSERT_NE(child_sampler, nullptr);
  sampler->AddChild(child_sampler);

  // Create an ImageFolder Dataset
  std::string folder_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> ds = ImageFolder(folder_path, false, sampler);
  // ASSERT (not EXPECT): ds is dereferenced right below.
  ASSERT_NE(ds, nullptr);

  // Iterate the dataset and get each row
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // ASSERT (not EXPECT): iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  EXPECT_EQ(i, 4);
  EXPECT_EQ(ds->GetDatasetSize(), 4);
  iter->Stop();
}
/// Feature: MindData Sampler Support
/// Description: Test MindData Sampler AddChild with num_samples of parent sampler < num_samples of child sampler
/// Expectation: Result dataset has expected number of samples.
TEST_F(MindDataTestPipeline, TestSamplerAddChild4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild4.";

  // num_samples of parent sampler < num_samples of child sampler, namely 5 < 7
  auto sampler = std::make_shared<DistributedSampler>(1, 0, false, 5, 0, -1, true);
  ASSERT_NE(sampler, nullptr);

  auto child_sampler = std::make_shared<PKSampler>(3, true, 7);
  ASSERT_NE(child_sampler, nullptr);
  sampler->AddChild(child_sampler);

  // Create an ImageFolder Dataset
  std::string folder_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> ds = ImageFolder(folder_path, false, sampler);
  // ASSERT (not EXPECT): ds is dereferenced right below.
  ASSERT_NE(ds, nullptr);

  // Iterate the dataset and get each row
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // ASSERT (not EXPECT): iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  EXPECT_EQ(i, 5);
  EXPECT_EQ(ds->GetDatasetSize(), 5);
  iter->Stop();
}
/// Feature: MindData Sampler Support
/// Description: Test MindData Sampler AddChild with several children
/// Expectation: Result dataset has expected number of samples, and output error messages for more than 1 child.
TEST_F(MindDataTestPipeline, TestSamplerAddChild5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild5.";

  // Use all samples (num_samples=0) for parent DistributedSampler
  auto sampler = std::make_shared<DistributedSampler>(1, 0, false, 0, 0, -1, true);
  ASSERT_NE(sampler, nullptr);

  auto child_sampler1 = std::make_shared<SequentialSampler>(0, 10);
  ASSERT_NE(child_sampler1, nullptr);
  sampler->AddChild(child_sampler1);

  // Adding further children is not expected to take effect: the row count checked below (10) matches
  // child_sampler1, and error messages are expected for the extra children.
  auto child_sampler2 = std::make_shared<SequentialSampler>(0, 6);
  ASSERT_NE(child_sampler2, nullptr);
  sampler->AddChild(child_sampler2);

  auto child_sampler3 = std::make_shared<SequentialSampler>(0, 7);
  ASSERT_NE(child_sampler3, nullptr);
  sampler->AddChild(child_sampler3);

  // Create an ImageFolder Dataset
  std::string folder_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> ds = ImageFolder(folder_path, false, sampler);
  // ASSERT (not EXPECT): ds is dereferenced right below.
  ASSERT_NE(ds, nullptr);

  // Iterate the dataset and get each row
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // ASSERT (not EXPECT): iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  EXPECT_EQ(i, 10);
  EXPECT_EQ(ds->GetDatasetSize(), 10);
  iter->Stop();
}
TEST_F(MindDataTestPipeline, TestSubsetSamplerSuccess1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSubsetSamplerSuccess1.";
// Test basic setting of subset_sampler with default num_samples
@ -553,7 +715,6 @@ TEST_F(MindDataTestPipeline, TestSubsetSamplerFail) {
TEST_F(MindDataTestPipeline, TestPKSamplerImageFolder) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestPKSamplerImageFolder.";
std::shared_ptr<Sampler> sampler = std::make_shared<PKSampler>(3, false);
EXPECT_NE(sampler, nullptr);

View File

@ -136,23 +136,22 @@ def test_numpyslices_sampler_chain_multi_add_child():
"""
Feature: Chained Sampler
Description: NumpySlicesDataset with sampler chain with multiple add_child() invocations
Expectation: Data verified to be correct. Only last add_child() invocation is effective.
Expectation: Data verified to be correct. A subsequent add_child() invocation replaces the prior
child sampler (if any).
"""
logger.info("test_numpyslices_sampler_chain_multi_add_child")
# Create NumpySlicesDataset with sampler chain
# Call add_child() multiple times in succession
# Note: A subsequent add_child() invocation replaces the prior child sampler (if any).
np_data = [1, 2, 3, 4, 5, 6, 7, 8]
sampler = ds.SequentialSampler(start_index=1, num_samples=None)
# 1st add_child invocation
sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=1))
# 2nd add_child invocation
sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=2))
# 3rd add_child invocation
sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=3))
# 4th and last add_child invocation which is the effective child sampler
sampler.add_child(ds.SequentialSampler(start_index=1, num_samples=6))
# Expect the second child will fail
with pytest.raises(RuntimeError) as info:
sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=2))
error_msg = "Cannot add child sampler, this sampler already has a child."
assert error_msg in str(info.value)
data1 = ds.NumpySlicesDataset(np_data, sampler=sampler)
@ -281,13 +280,14 @@ def test_coco_sampler_chain():
def test_cifar_sampler_chain():
"""
Test Cifar sampler chain
Test Cifar sampler chain, including nested child sampler
"""
logger.info("test_cifar_sampler_chain")
sampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=5)
child_sampler = ds.RandomSampler(replacement=True, num_samples=4)
child_sampler2 = ds.SequentialSampler(start_index=0, num_samples=2)
# Note: Add nested child_sampler2 to child_sampler
child_sampler.add_child(child_sampler2)
sampler.add_child(child_sampler)
data1 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, sampler=sampler)