forked from mindspore-Ecosystem/mindspore
!29358 MD Child Sampler Code and UT Issues
Merge pull request !29358 from zetongzhao/sampler_AddChild
This commit is contained in:
commit
e12d6af257
|
@ -29,6 +29,14 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
Status Sampler::BuildChildren(std::shared_ptr<SamplerObj> *const sampler) const {
|
||||
for (const auto &child : children_) {
|
||||
std::shared_ptr<SamplerObj> sampler_obj = child->Parse();
|
||||
RETURN_IF_NOT_OK((*sampler)->AddChildSampler(sampler_obj));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// DistributedSampler
|
||||
DistributedSampler::DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle, int64_t num_samples,
|
||||
uint32_t seed, int64_t offset, bool even_dist)
|
||||
|
@ -41,8 +49,13 @@ DistributedSampler::DistributedSampler(int64_t num_shards, int64_t shard_id, boo
|
|||
even_dist_(even_dist) {}
|
||||
|
||||
std::shared_ptr<SamplerObj> DistributedSampler::Parse() const {
|
||||
return std::make_shared<DistributedSamplerObj>(num_shards_, shard_id_, shuffle_, num_samples_, seed_, offset_,
|
||||
even_dist_);
|
||||
std::shared_ptr<SamplerObj> output =
|
||||
std::make_shared<DistributedSamplerObj>(num_shards_, shard_id_, shuffle_, num_samples_, seed_, offset_, even_dist_);
|
||||
Status s = BuildChildren(&output);
|
||||
if (s.IsError()) {
|
||||
MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
// PKSampler
|
||||
|
@ -50,7 +63,12 @@ PKSampler::PKSampler(int64_t num_val, bool shuffle, int64_t num_samples)
|
|||
: num_val_(num_val), shuffle_(shuffle), num_samples_(num_samples) {}
|
||||
|
||||
std::shared_ptr<SamplerObj> PKSampler::Parse() const {
|
||||
return std::make_shared<PKSamplerObj>(num_val_, shuffle_, num_samples_);
|
||||
std::shared_ptr<SamplerObj> output = std::make_shared<PKSamplerObj>(num_val_, shuffle_, num_samples_);
|
||||
Status s = BuildChildren(&output);
|
||||
if (s.IsError()) {
|
||||
MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
// RandomSampler
|
||||
|
@ -58,7 +76,12 @@ RandomSampler::RandomSampler(bool replacement, int64_t num_samples)
|
|||
: replacement_(replacement), num_samples_(num_samples) {}
|
||||
|
||||
std::shared_ptr<SamplerObj> RandomSampler::Parse() const {
|
||||
return std::make_shared<RandomSamplerObj>(replacement_, num_samples_);
|
||||
std::shared_ptr<SamplerObj> output = std::make_shared<RandomSamplerObj>(replacement_, num_samples_);
|
||||
Status s = BuildChildren(&output);
|
||||
if (s.IsError()) {
|
||||
MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
// SequentialSampler
|
||||
|
@ -66,7 +89,12 @@ SequentialSampler::SequentialSampler(int64_t start_index, int64_t num_samples)
|
|||
: start_index_(start_index), num_samples_(num_samples) {}
|
||||
|
||||
std::shared_ptr<SamplerObj> SequentialSampler::Parse() const {
|
||||
return std::make_shared<SequentialSamplerObj>(start_index_, num_samples_);
|
||||
std::shared_ptr<SamplerObj> output = std::make_shared<SequentialSamplerObj>(start_index_, num_samples_);
|
||||
Status s = BuildChildren(&output);
|
||||
if (s.IsError()) {
|
||||
MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
// SubsetSampler
|
||||
|
@ -74,7 +102,12 @@ SubsetSampler::SubsetSampler(const std::vector<int64_t> &indices, int64_t num_sa
|
|||
: indices_(indices), num_samples_(num_samples) {}
|
||||
|
||||
std::shared_ptr<SamplerObj> SubsetSampler::Parse() const {
|
||||
return std::make_shared<SubsetSamplerObj>(indices_, num_samples_);
|
||||
std::shared_ptr<SamplerObj> output = std::make_shared<SubsetSamplerObj>(indices_, num_samples_);
|
||||
Status s = BuildChildren(&output);
|
||||
if (s.IsError()) {
|
||||
MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
// SubsetRandomSampler
|
||||
|
@ -82,7 +115,12 @@ SubsetRandomSampler::SubsetRandomSampler(const std::vector<int64_t> &indices, in
|
|||
: SubsetSampler(indices, num_samples) {}
|
||||
|
||||
std::shared_ptr<SamplerObj> SubsetRandomSampler::Parse() const {
|
||||
return std::make_shared<SubsetRandomSamplerObj>(indices_, num_samples_);
|
||||
std::shared_ptr<SamplerObj> output = std::make_shared<SubsetRandomSamplerObj>(indices_, num_samples_);
|
||||
Status s = BuildChildren(&output);
|
||||
if (s.IsError()) {
|
||||
MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
// WeightedRandomSampler
|
||||
|
@ -90,7 +128,12 @@ WeightedRandomSampler::WeightedRandomSampler(const std::vector<double> &weights,
|
|||
: weights_(weights), num_samples_(num_samples), replacement_(replacement) {}
|
||||
|
||||
std::shared_ptr<SamplerObj> WeightedRandomSampler::Parse() const {
|
||||
return std::make_shared<WeightedRandomSamplerObj>(weights_, num_samples_, replacement_);
|
||||
std::shared_ptr<SamplerObj> output = std::make_shared<WeightedRandomSamplerObj>(weights_, num_samples_, replacement_);
|
||||
Status s = BuildChildren(&output);
|
||||
if (s.IsError()) {
|
||||
MS_LOG(ERROR) << "[Internal ERROR] Error in Parse. Message: " << s;
|
||||
}
|
||||
return output;
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include <vector>
|
||||
|
||||
#include "include/api/types.h"
|
||||
#include "include/api/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
@ -88,6 +89,11 @@ class MS_API Sampler : std::enable_shared_from_this<Sampler> {
|
|||
/// \return Shared pointer to the newly created SamplerObj.
|
||||
virtual std::shared_ptr<SamplerObj> Parse() const = 0;
|
||||
|
||||
/// \brief A function that calls Parse() on the children of this sampler
|
||||
/// \param[in] sampler The samplerIR object built from this sampler
|
||||
/// \return the Status code returned
|
||||
Status BuildChildren(std::shared_ptr<SamplerObj> *const sampler) const;
|
||||
|
||||
std::vector<std::shared_ptr<Sampler>> children_;
|
||||
};
|
||||
|
||||
|
|
|
@ -105,8 +105,8 @@ class BuiltinSampler:
|
|||
|
||||
def add_child(self, sampler):
|
||||
"""
|
||||
Add a sub-sampler for given sampler. The sub-sampler will receive all data from the
|
||||
output of parent sampler and apply its sample logic to return new samples.
|
||||
Add a sub-sampler for given sampler. The parent will receive all data from the
|
||||
output of the sub-sampler and apply its sample logic to return new samples.
|
||||
|
||||
Args:
|
||||
sampler (Sampler): Object used to choose samples from the dataset. Only builtin
|
||||
|
@ -115,9 +115,11 @@ class BuiltinSampler:
|
|||
|
||||
Examples:
|
||||
>>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
|
||||
>>> sampler.add_child(ds.RandomSampler(num_samples=2))
|
||||
>>> sampler.add_child(ds.RandomSampler(num_samples=4))
|
||||
>>> dataset = ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler)
|
||||
"""
|
||||
if self.child_sampler is not None:
|
||||
raise RuntimeError("Cannot add child sampler, this sampler already has a child.")
|
||||
self.child_sampler = sampler
|
||||
|
||||
def get_child(self):
|
||||
|
|
|
@ -93,7 +93,7 @@ TEST_F(MindDataTestPipeline, TestImageFolderWithSamplers) {
|
|||
// iterate through dataset and count rows
|
||||
// Expectation: There should be 12 rows
|
||||
TEST_F(MindDataTestPipeline, TestWeightedRandomSamplerImageFolder) {
|
||||
std::vector<double> weights = {0.9, 0.8, 0.68, 0.7, 0.71, 0.6, 0.5, 0.4, 0.3, 0.5, 0.2, 0.1};
|
||||
std::vector<double> weights = {0.9, 0.8, 0.68, 0.7, 0.71, 0.6, 0.5, 0.4, 0.3, 0.5, 0.2, 0.1};
|
||||
std::shared_ptr<Sampler> sampl = std::make_shared<WeightedRandomSampler>(weights, 12);
|
||||
EXPECT_NE(sampl, nullptr);
|
||||
|
||||
|
@ -272,7 +272,7 @@ TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess4) {
|
|||
}
|
||||
|
||||
// Feature: Test ImageFolder with DistributedSampler
|
||||
// Description: Create ImageFolder dataset with DistributedSampler given num_shards=11 and shard_id=10,
|
||||
// Description: Create ImageFolder dataset with DistributedSampler given num_shards=11 and shard_id=10,
|
||||
// count rows in dataset
|
||||
// Expectation: There should be 4 rows (44 rows in original data/11 = 4)
|
||||
TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess5) {
|
||||
|
@ -306,7 +306,7 @@ TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess5) {
|
|||
}
|
||||
|
||||
// Feature: Test ImageFolder with DistributedSampler
|
||||
// Description: Create ImageFolder dataset with DistributedSampler given num_shards=4 and shard_id=3,
|
||||
// Description: Create ImageFolder dataset with DistributedSampler given num_shards=4 and shard_id=3,
|
||||
// count rows in dataset
|
||||
// Expectation: There should be 11 rows (44 rows in original data/4 = 11)
|
||||
TEST_F(MindDataTestPipeline, TestDistributedSamplerSuccess6) {
|
||||
|
@ -431,6 +431,168 @@ TEST_F(MindDataTestPipeline, TestSamplerAddChild) {
|
|||
iter->Stop();
|
||||
}
|
||||
|
||||
/// Feature: MindData Sampler Support
|
||||
/// Description: Test MindData Sampler AddChild with nested children
|
||||
/// Expectation: Result dataset has expected number of samples.
|
||||
TEST_F(MindDataTestPipeline, TestSamplerAddChild2) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild2.";
|
||||
|
||||
// num_samples of parent sampler > num_samples of child sampler, namely 5 > 2, num_shards is 2 to output dataset with
|
||||
// 1 sample
|
||||
auto sampler = std::make_shared<DistributedSampler>(2, 0, false, 5, 0, -1, true);
|
||||
EXPECT_NE(sampler, nullptr);
|
||||
|
||||
// num_samples of parent sampler > num_samples of child sampler, namely 4 > 2
|
||||
auto child_sampler = std::make_shared<RandomSampler>(true, 4);
|
||||
EXPECT_NE(child_sampler, nullptr);
|
||||
auto child_sampler2 = std::make_shared<SequentialSampler>(0, 2);
|
||||
EXPECT_NE(child_sampler2, nullptr);
|
||||
|
||||
child_sampler->AddChild(child_sampler2);
|
||||
sampler->AddChild(child_sampler);
|
||||
|
||||
// Create an ImageFolder Dataset
|
||||
std::string folder_path = datasets_root_path_ + "/testPK/data/";
|
||||
std::shared_ptr<Dataset> ds = ImageFolder(folder_path, false, sampler);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Iterate the dataset and get each row
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
EXPECT_NE(iter, nullptr);
|
||||
std::unordered_map<std::string, mindspore::MSTensor> row;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
|
||||
uint64_t i = 0;
|
||||
while (row.size() != 0) {
|
||||
i++;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
}
|
||||
EXPECT_EQ(i, 1);
|
||||
|
||||
EXPECT_EQ(ds->GetDatasetSize(), 1);
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
/// Feature: MindData Sampler Support
|
||||
/// Description: Test MindData Sampler AddChild with num_samples of parent sampler > num_samples of child sampler
|
||||
/// Expectation: Result dataset has expected number of samples.
|
||||
TEST_F(MindDataTestPipeline, TestSamplerAddChild3) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild3.";
|
||||
|
||||
// num_samples of parent sampler > num_samples of child sampler, namely 5 > 4
|
||||
std::vector<double> weights = {1.0, 0.1, 0.02, 0.3};
|
||||
auto sampler = std::make_shared<WeightedRandomSampler>(weights, 5);
|
||||
EXPECT_NE(sampler, nullptr);
|
||||
|
||||
auto child_sampler = std::make_shared<SequentialSampler>(0, 4);
|
||||
EXPECT_NE(child_sampler, nullptr);
|
||||
|
||||
sampler->AddChild(child_sampler);
|
||||
|
||||
// Create an ImageFolder Dataset
|
||||
std::string folder_path = datasets_root_path_ + "/testPK/data/";
|
||||
std::shared_ptr<Dataset> ds = ImageFolder(folder_path, false, sampler);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Iterate the dataset and get each row
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
EXPECT_NE(iter, nullptr);
|
||||
std::unordered_map<std::string, mindspore::MSTensor> row;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
|
||||
uint64_t i = 0;
|
||||
while (row.size() != 0) {
|
||||
i++;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
}
|
||||
EXPECT_EQ(i, 4);
|
||||
|
||||
EXPECT_EQ(ds->GetDatasetSize(), 4);
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
/// Feature: MindData Sampler Support
|
||||
/// Description: Test MindData Sampler AddChild with num_samples of parent sampler < num_samples of child sampler
|
||||
/// Expectation: Result dataset has expected number of samples.
|
||||
TEST_F(MindDataTestPipeline, TestSamplerAddChild4) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild4.";
|
||||
|
||||
// num_samples of parent sampler < num_samples of child sampler, namely 5 < 7
|
||||
auto sampler = std::make_shared<DistributedSampler>(1, 0, false, 5, 0, -1, true);
|
||||
EXPECT_NE(sampler, nullptr);
|
||||
|
||||
auto child_sampler = std::make_shared<PKSampler>(3, true, 7);
|
||||
EXPECT_NE(child_sampler, nullptr);
|
||||
|
||||
sampler->AddChild(child_sampler);
|
||||
|
||||
// Create an ImageFolder Dataset
|
||||
std::string folder_path = datasets_root_path_ + "/testPK/data/";
|
||||
std::shared_ptr<Dataset> ds = ImageFolder(folder_path, false, sampler);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Iterate the dataset and get each row
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
EXPECT_NE(iter, nullptr);
|
||||
std::unordered_map<std::string, mindspore::MSTensor> row;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
|
||||
uint64_t i = 0;
|
||||
while (row.size() != 0) {
|
||||
i++;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
}
|
||||
EXPECT_EQ(i, 5);
|
||||
|
||||
EXPECT_EQ(ds->GetDatasetSize(), 5);
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
/// Feature: MindData Sampler Support
|
||||
/// Description: Test MindData Sampler AddChild with several children
|
||||
/// Expectation: Result dataset has expected number of samples, and output error messages for more than 1 child.
|
||||
TEST_F(MindDataTestPipeline, TestSamplerAddChild5) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSamplerAddChild5.";
|
||||
|
||||
// Use all samples (num_samples=0) for parent DistributedSampler
|
||||
auto sampler = std::make_shared<DistributedSampler>(1, 0, false, 0, 0, -1, true);
|
||||
EXPECT_NE(sampler, nullptr);
|
||||
|
||||
auto child_sampler1 = std::make_shared<SequentialSampler>(0, 10);
|
||||
EXPECT_NE(child_sampler1, nullptr);
|
||||
sampler->AddChild(child_sampler1);
|
||||
|
||||
// Attempting to add more than one child sampler is expected to fail
|
||||
auto child_sampler2 = std::make_shared<SequentialSampler>(0, 6);
|
||||
EXPECT_NE(child_sampler2, nullptr);
|
||||
sampler->AddChild(child_sampler2);
|
||||
|
||||
auto child_sampler3 = std::make_shared<SequentialSampler>(0, 7);
|
||||
EXPECT_NE(child_sampler3, nullptr);
|
||||
sampler->AddChild(child_sampler3);
|
||||
|
||||
// Create an ImageFolder Dataset
|
||||
std::string folder_path = datasets_root_path_ + "/testPK/data/";
|
||||
std::shared_ptr<Dataset> ds = ImageFolder(folder_path, false, sampler);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Iterate the dataset and get each row
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
EXPECT_NE(iter, nullptr);
|
||||
std::unordered_map<std::string, mindspore::MSTensor> row;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
|
||||
uint64_t i = 0;
|
||||
while (row.size() != 0) {
|
||||
i++;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
}
|
||||
EXPECT_EQ(i, 10);
|
||||
|
||||
EXPECT_EQ(ds->GetDatasetSize(), 10);
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestSubsetSamplerSuccess1) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSubsetSamplerSuccess1.";
|
||||
// Test basic setting of subset_sampler with default num_samples
|
||||
|
@ -553,7 +715,6 @@ TEST_F(MindDataTestPipeline, TestSubsetSamplerFail) {
|
|||
TEST_F(MindDataTestPipeline, TestPKSamplerImageFolder) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestPKSamplerImageFolder.";
|
||||
|
||||
|
||||
std::shared_ptr<Sampler> sampler = std::make_shared<PKSampler>(3, false);
|
||||
EXPECT_NE(sampler, nullptr);
|
||||
|
||||
|
|
|
@ -136,23 +136,22 @@ def test_numpyslices_sampler_chain_multi_add_child():
|
|||
"""
|
||||
Feature: Chained Sampler
|
||||
Description: NumpySlicesDataset with sampler chain with multiple add_child() invocations
|
||||
Expectation: Data verified to be correct. Only last add_child() invocation is effective.
|
||||
Expectation: Data verified to be correct. A subsequent add_child() invocation replaces the prior
|
||||
child sampler (if any).
|
||||
"""
|
||||
logger.info("test_numpyslices_sampler_chain_multi_add_child")
|
||||
|
||||
# Create NumpySlicesDataset with sampler chain
|
||||
# Call add_child() multiple times in succession
|
||||
# Note: A subsequent add_child() invocation replaces the prior child sampler (if any).
|
||||
np_data = [1, 2, 3, 4, 5, 6, 7, 8]
|
||||
sampler = ds.SequentialSampler(start_index=1, num_samples=None)
|
||||
# 1st add_child invocation
|
||||
sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=1))
|
||||
# 2nd add_child invocation
|
||||
sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=2))
|
||||
# 3rd add_child invocation
|
||||
sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=3))
|
||||
# 4th and last add_child invocation which is the effective child sampler
|
||||
sampler.add_child(ds.SequentialSampler(start_index=1, num_samples=6))
|
||||
# Expect adding a second child sampler to fail
|
||||
with pytest.raises(RuntimeError) as info:
|
||||
sampler.add_child(ds.SequentialSampler(start_index=4, num_samples=2))
|
||||
|
||||
error_msg = "Cannot add child sampler, this sampler already has a child."
|
||||
assert error_msg in str(info.value)
|
||||
|
||||
data1 = ds.NumpySlicesDataset(np_data, sampler=sampler)
|
||||
|
||||
|
@ -281,13 +280,14 @@ def test_coco_sampler_chain():
|
|||
|
||||
def test_cifar_sampler_chain():
|
||||
"""
|
||||
Test Cifar sampler chain
|
||||
Test Cifar sampler chain, including nested child sampler
|
||||
"""
|
||||
logger.info("test_cifar_sampler_chain")
|
||||
|
||||
sampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=5)
|
||||
child_sampler = ds.RandomSampler(replacement=True, num_samples=4)
|
||||
child_sampler2 = ds.SequentialSampler(start_index=0, num_samples=2)
|
||||
# Note: Add nested child_sampler2 to child_sampler
|
||||
child_sampler.add_child(child_sampler2)
|
||||
sampler.add_child(child_sampler)
|
||||
data1 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, sampler=sampler)
|
||||
|
|
Loading…
Reference in New Issue