From ce88d1dd91ae3099efcbe359e8250d3edcdecc85 Mon Sep 17 00:00:00 2001 From: luoyang Date: Mon, 11 Oct 2021 11:39:44 +0800 Subject: [PATCH] Add examples for MindData C++ API - stage 1 --- .../dataset/include/dataset/datasets.h | 496 ++++++++++++++++++ .../dataset/include/dataset/execute.h | 23 + .../dataset/include/dataset/iterator.h | 28 + .../dataset/include/dataset/samplers.h | 44 ++ .../minddata/dataset/include/dataset/vision.h | 2 +- 5 files changed, 592 insertions(+), 1 deletion(-) diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h b/mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h index 423ec07ed08..9ec1dd25d06 100644 --- a/mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h +++ b/mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h @@ -134,11 +134,24 @@ class Dataset : public std::enable_shared_from_this { /// \brief Function to set runtime number of workers. /// \param[in] num_workers The number of threads in this operator. /// \return Shared pointer to the original object. + /// \par Example + /// \code + /// /* Set number of workers(threads) to process the dataset in parallel */ + /// std::shared_ptr ds = ImageFolder(folder_path, true); + /// ds = ds->SetNumWorkers(16); + /// \endcode std::shared_ptr SetNumWorkers(int32_t num_workers); /// \brief A Function to create an PullBasedIterator over the Dataset. /// \param[in] columns List of columns to be used to specify the order of columns. /// \return Shared pointer to the Iterator. + /// \par Example + /// \code + /// /* dataset is an instance of Dataset object */ + /// std::shared_ptr = dataset->CreatePullBasedIterator(); + /// std::unordered_map row; + /// iter->GetNextRow(&row); + /// \endcode std::shared_ptr CreatePullBasedIterator(std::vector> columns = {}); /// \brief Function to create an Iterator over the Dataset pipeline. 
@@ -146,6 +159,13 @@ class Dataset : public std::enable_shared_from_this { /// \param[in] num_epochs Number of epochs to run through the pipeline (default=-1, which means infinite epochs). /// An empty row is returned at the end of each epoch. /// \return Shared pointer to the Iterator. + /// \par Example + /// \code + /// /* dataset is an instance of Dataset object */ + /// std::shared_ptr iter = dataset->CreateIterator(); + /// std::unordered_map row; + /// iter->GetNextRow(&row); + /// \endcode std::shared_ptr CreateIterator(std::vector columns = {}, int32_t num_epochs = -1) { return CreateIteratorCharIF(VectorStringToChar(columns), num_epochs); } @@ -181,6 +201,14 @@ class Dataset : public std::enable_shared_from_this { /// \param[in] num_files Number of dataset files (default=1). /// \param[in] dataset_type Dataset format (default="mindrecord"). /// \return Returns true if no error encountered else false. + /// \par Example + /// \code + /// /* Create a dataset and save its data into MindRecord */ + /// std::string folder_path = "/path/to/cifar_dataset"; + /// std::shared_ptr ds = Cifar10(folder_path, "all", std::make_shared(0, 10)); + /// std::string save_file = "Cifar10Data.mindrecord"; + /// bool rc = ds->Save(save_file); + /// \endcode bool Save(std::string dataset_path, int32_t num_files = 1, std::string dataset_type = "mindrecord") { return SaveCharIF(StringToChar(dataset_path), num_files, StringToChar(dataset_type)); } @@ -193,6 +221,12 @@ class Dataset : public std::enable_shared_from_this { /// available to make the last batch, then those rows will /// be dropped and not propagated to the next node. /// \return Shared pointer to the current Dataset.
+ /// \par Example + /// \code + /// /* Create a dataset where every 100 rows is combined into a batch */ + /// std::shared_ptr ds = ImageFolder(folder_path, true); + /// ds = ds->Batch(100, true); + /// \endcode std::shared_ptr Batch(int32_t batch_size, bool drop_remainder = false); /// \brief Function to create a BucketBatchByLengthDataset. @@ -221,6 +255,12 @@ class Dataset : public std::enable_shared_from_this { /// \param[in] drop_remainder If true, will drop the last batch for each bucket if it is not a full batch /// (default=false). /// \return Shared pointer to the current Dataset. + /// \par Example + /// \code + /// /* Bucket elements according to their lengths */ + /// std::shared_ptr ds = Mnist(folder_path, "all", std::make_shared(false, 10)); + /// ds = ds->BucketBatchByLength({"image"}, {1, 2, 3}, {4, 5, 6, 7}); + /// \endcode std::shared_ptr BucketBatchByLength( const std::vector &column_names, const std::vector &bucket_boundaries, const std::vector &bucket_batch_sizes, @@ -243,6 +283,14 @@ class Dataset : public std::enable_shared_from_this { /// The input sentence must be pretokenized when using word type. /// \param[in] params A vector contains more option parameters of sentencepiece library. /// \return Shared pointer to the SentencePieceVocab. 
+ /// \par Example + /// \code + /// /* Build a SentencePieceVocab from TextFile dataset */ + /// std::string vocab_file = "/path/to/txtfile"; + /// std::shared_ptr ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse); + /// std::shared_ptr vocab = + /// ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {}); + /// \endcode std::shared_ptr BuildSentencePieceVocab( const std::vector &col_names, int32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::unordered_map &params) { @@ -263,6 +311,13 @@ class Dataset : public std::enable_shared_from_this { /// \param[in] special_first Whether special_tokens will be prepended/appended to vocab, If special_tokens /// is specified and special_first is set to default, special_tokens will be prepended. /// \return Shared pointer to the Vocab. + /// \par Example + /// \code + /// /* Build a Vocab from TextFile dataset */ + /// std::string vocab_file = "/path/to/txtfile"; + /// std::shared_ptr ds = TextFile({vocab_file}, 0, ShuffleMode::kFalse); + /// std::shared_ptr vocab = ds->BuildVocab(); + /// \endcode std::shared_ptr BuildVocab(const std::vector &columns = {}, const std::pair &freq_range = {0, kDeMaxFreq}, int64_t top_k = kDeMaxTopk, const std::vector &special_tokens = {}, @@ -275,6 +330,13 @@ class Dataset : public std::enable_shared_from_this { /// \note Concat the datasets in the input. /// \param[in] datasets List of shared pointers to the dataset that should be concatenated together. /// \return Shared pointer to the current Dataset.
+ /// \par Example + /// \code + /// /* Create a dataset by concatenating dataset_1 and dataset_2 with "+" operator */ + /// std::shared_ptr dataset = dataset_1 + dataset_2; + /// /* Create a dataset by concatenating dataset_1 and dataset_2 with concat operation */ + /// std::shared_ptr dataset = dataset_1->Concat({dataset_2}); + /// \endcode std::shared_ptr Concat(const std::vector> &datasets) { std::vector> all_datasets{shared_from_this()}; all_datasets.insert(std::end(all_datasets), std::begin(datasets), std::end(datasets)); @@ -286,6 +348,28 @@ class Dataset : public std::enable_shared_from_this { /// \param[in] predicate Function callable which returns a boolean value. If false then filter the element. /// \param[in] input_columns List of names of the input columns to filter. /// \return Shared pointer to the current Dataset. + /// \par Example + /// \code + /// /* Define a predicate function */ + /// MSTensorVec Predicate1(MSTensorVec in) { + /// // Return true if input is equal to 3 + /// uint64_t input_value; + /// TensorRow input = VecToRow(in); + /// (void)input.at(0)->GetItemAt(&input_value, {0}); + /// bool result = (input_value == 3); + /// // Convert from boolean to TensorRow + /// TensorRow output; + /// std::shared_ptr out; + /// (void)Tensor::CreateEmpty(mindspore::dataset::TensorShape({}), + /// mindspore::dataset::DataType(mindspore::dataset::DataType::Type::DE_BOOL), &out); + /// (void)out->SetItemAt({}, result); + /// output.push_back(out); + /// return RowToVec(output); + /// } + /// + /// /* Apply predicate function for dataset */ + /// std::shared_ptr ds = ds->Filter(Predicate1, {"label"}); + /// \endcode std::shared_ptr Filter(std::function predicate, const std::vector &input_columns = {}) { return std::make_shared(shared_from_this(), predicate, VectorStringToChar(input_columns)); @@ -308,6 +392,40 @@ class Dataset : public std::enable_shared_from_this { /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
/// \param[in] callbacks List of Dataset callbacks to be called. /// \return Shared pointer to the current Dataset. + /// \par Example + /// \code + /// // Create objects for the tensor ops + /// std::shared_ptr decode_op = std::make_shared(true); + /// std::shared_ptr random_color_op = std::make_shared(0.0, 0.0); + /// + /// /* 1) Simple map example */ + /// // Apply decode_op on column "image". This column will be replaced by the outputted + /// // column of decode_op. Since column_order is not provided, both columns "image" + /// // and "label" will be propagated to the child node in their original order. + /// dataset = dataset->Map({decode_op}, {"image"}); + /// + /// // Decode and rename column "image" to "decoded_image". + /// dataset = dataset->Map({decode_op}, {"image"}, {"decoded_image"}); + /// + /// // Specify the order of the output columns. + /// dataset = dataset->Map({decode_op}, {"image"}, {}, {"label", "image"}); + /// + /// // Rename column "image" to "decoded_image" and also specify the order of the output columns. + /// dataset = dataset->Map({decode_op}, {"image"}, {"decoded_image"}, {"label", "decoded_image"}); + /// + /// // Rename column "image" to "decoded_image" and keep only this column. + /// dataset = dataset->Map({decode_op}, {"image"}, {"decoded_image"}, {"decoded_image"}); + /// + /// /* 2) Map example with more than one operation */ + /// // Create a dataset where the images are decoded, then randomly color jittered. + /// // decode_op takes column "image" as input and outputs one column. The column + /// // outputted by decode_op is passed as input to random_jitter_op. + /// // random_jitter_op will output one column. Column "image" will be replaced by + /// // the column outputted by random_jitter_op (the very last operation). All other + /// // columns are unchanged. Since column_order is not specified, the order of the + /// // columns will remain the same. 
+ /// dataset = dataset->Map({decode_op, random_jitter_op}, {"image"}) + /// \endcode std::shared_ptr Map(std::vector operations, const std::vector &input_columns = {}, const std::vector &output_columns = {}, @@ -391,6 +509,12 @@ class Dataset : public std::enable_shared_from_this { /// \note Applies project to the dataset. /// \param[in] columns The name of columns to project. /// \return Shared pointer to the current Dataset. + /// \par Example + /// \code + /// /* Reorder the original column names in dataset */ + /// std::shared_ptr ds = Mnist(folder_path, "all", std::make_shared(false, 10)); + /// ds = ds->Project({"label", "image"}); + /// \endcode std::shared_ptr Project(const std::vector &columns) { return std::make_shared(shared_from_this(), VectorStringToChar(columns)); } @@ -400,6 +524,12 @@ class Dataset : public std::enable_shared_from_this { /// \param[in] input_columns List of the input columns to rename. /// \param[in] output_columns List of the output columns. /// \return Shared pointer to the current Dataset. + /// \par Example + /// \code + /// /* Rename the original column names in dataset */ + /// std::shared_ptr ds = Mnist(folder_path, "all", std::make_shared(false, 10)); + /// ds = ds->Rename({"image", "label"}, {"image_output", "label_output"}); + /// \endcode std::shared_ptr Rename(const std::vector &input_columns, const std::vector &output_columns) { return std::make_shared(shared_from_this(), VectorStringToChar(input_columns), @@ -409,6 +539,12 @@ class Dataset : public std::enable_shared_from_this { /// \note Repeats this dataset count times. Repeat indefinitely if count is -1. /// \param[in] count Number of times the dataset should be repeated. /// \return Shared pointer to the current Dataset. 
+ /// \par Example + /// \code + /// /* Create a dataset where the dataset is repeated for 50 epochs */ + /// std::shared_ptr ds = Mnist(folder_path, "all", std::make_shared(false, 10)); + /// ds = ds->Repeat(50); + /// \endcode std::shared_ptr Repeat(int32_t count = -1) { return std::make_shared(shared_from_this(), count); } @@ -416,6 +552,12 @@ class Dataset : public std::enable_shared_from_this { /// \note Randomly shuffles the rows of this dataset. /// \param[in] buffer_size The size of the buffer (must be larger than 1) for shuffling /// \return Shared pointer to the current Dataset. + /// \par Example + /// \code + /// /* Create a shuffled dataset using a shuffle buffer of size 4 */ + /// std::shared_ptr ds = Mnist(folder_path, "all", std::make_shared(false, 10)); + /// ds = ds->Shuffle(4); + /// \endcode std::shared_ptr Shuffle(int32_t buffer_size) { return std::make_shared(shared_from_this(), buffer_size); } @@ -424,12 +566,24 @@ class Dataset : public std::enable_shared_from_this { /// \note Skips count elements in this dataset. /// \param[in] count Number of elements the dataset to be skipped. /// \return Shared pointer to the current Dataset. + /// \par Example + /// \code + /// /* Create a dataset which skips first 3 elements from data */ + /// std::shared_ptr ds = Mnist(folder_path, "all", std::make_shared(false, 10)); + /// ds = ds->Skip(3); + /// \endcode std::shared_ptr Skip(int32_t count) { return std::make_shared(shared_from_this(), count); } /// \brief Function to create a TakeDataset. /// \note Takes count elements in this dataset. /// \param[in] count Number of elements the dataset to be taken. /// \return Shared pointer to the current Dataset. + /// \par Example + /// \code + /// /* Create a dataset where the dataset includes 50 elements. 
*/ + /// std::shared_ptr ds = Mnist(folder_path, "all", std::make_shared(false, 10)); + /// ds = ds->Take(50); + /// \endcode std::shared_ptr Take(int32_t count = -1) { return std::make_shared(shared_from_this(), count); } @@ -438,6 +592,13 @@ class Dataset : public std::enable_shared_from_this { /// \note Applies zip to the dataset. /// \param[in] datasets A list of shared pointers to the datasets that we want to zip. /// \return Shared pointer to the current Dataset. + /// \par Example + /// \code + /// /* Create a dataset which is the combination of dataset and dataset_1 */ + /// std::shared_ptr ds1 = ImageFolder(folder_path, true, std::make_shared(false, 10)); + /// std::shared_ptr ds2 = Cifar10(folder_path, "all", std::make_shared(false, 10)); + /// std::shared_ptr ds = ds->Zip({ds1, ds2}); + /// \endcode std::shared_ptr Zip(const std::vector> &datasets) { std::vector> all_datasets = datasets; all_datasets.push_back(shared_from_this()); @@ -880,6 +1041,22 @@ class AlbumDataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the AlbumDataset. 
+/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/album_dataset_directory"; +/// std::string schema_file = "/path/to/album_schema_file"; +/// std::vector column_names = {"image", "label", "id"}; +/// std::shared_ptr ds = Album(folder_path, schema_file, column_names); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: As we defined before, each data dictionary owns keys "image", "label" and "id" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, const std::vector &column_names = {}, bool decode = false, const std::shared_ptr &sampler = std::make_shared(), @@ -972,6 +1149,20 @@ class CelebADataset : public Dataset { /// \param[in] extensions Set of file extensions to be included in the dataset (default={}). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the CelebADataset. 
+/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/celeba_dataset_directory"; +/// std::shared_ptr ds = CelebA(folder_path, "all", std::make_shared(0, 5)); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In CelebA dataset, each data dictionary owns keys "image" and "attr" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr CelebA( const std::string &dataset_dir, const std::string &usage = "all", const std::shared_ptr &sampler = std::make_shared(), bool decode = false, @@ -1057,6 +1248,20 @@ class Cifar10Dataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the Cifar10Dataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/cifar10_dataset_directory"; +/// std::shared_ptr ds = Cifar10(folder_path, "all", std::make_shared(false, 10)); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In CIFAR10 dataset, each data dictionary owns keys "image" and "label" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr Cifar10( const std::string &dataset_dir, const std::string &usage = "all", const std::shared_ptr &sampler = std::make_shared(), @@ -1131,6 +1336,20 @@ class Cifar100Dataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the Cifar100Dataset. 
+/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/cifar100_dataset_directory"; +/// std::shared_ptr ds = Cifar100(folder_path, "all", std::make_shared()); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In CIFAR100 dataset, each dictionary has 3 keys: "image", "fine_label" and "coarse_label" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr Cifar100( const std::string &dataset_dir, const std::string &usage = "all", const std::shared_ptr &sampler = std::make_shared(), @@ -1232,6 +1451,20 @@ class CityscapesDataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). /// \return Shared pointer to the current CityscapesDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/cityscapes_dataset_directory"; +/// std::shared_ptr ds = Cityscapes(folder_path, "train", "fine", "color"); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In Cityscapes dataset, each data dictionary owns keys "image" and "task" */ +/// auto task = row["task"]; +/// \endcode inline std::shared_ptr Cityscapes( const std::string &dataset_dir, const std::string &usage, const std::string &quality_mode, const std::string &task, bool decode = false, const std::shared_ptr &sampler = std::make_shared(), @@ -1328,6 +1561,19 @@ class CLUEDataset : public Dataset { /// specified only when num_shards is also specified (Default = 0). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
/// \return Shared pointer to the CLUEDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string train_file = "/path/to/clue_dataset_file"; +/// std::shared_ptr ds = CLUE({train_file}, "AFQMC", "train", 0, ShuffleMode::kFalse); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// auto text = row["sentence1"]; +/// \endcode inline std::shared_ptr CLUE(const std::vector &dataset_files, const std::string &task = "AFQMC", const std::string &usage = "train", int64_t num_samples = 0, ShuffleMode shuffle = ShuffleMode::kGlobal, @@ -1400,6 +1646,21 @@ class CocoDataset : public Dataset { /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \param[in] extra_metadata Flag to add extra meta-data to row. (default=false). /// \return Shared pointer to the CocoDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/coco_dataset_directory"; +/// std::string annotation_file = "/path/to/annotation_file"; +/// std::shared_ptr ds = Coco(folder_path, annotation_file); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In COCO dataset, each dictionary has keys "image" and "annotation" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task = "Detection", const bool &decode = false, const std::shared_ptr &sampler = std::make_shared(), @@ -1513,6 +1774,21 @@ class CSVDataset : public Dataset { /// specified only when num_shards is also specified (Default = 0). /// \param[in] cache Tensor cache to use.(default=nullptr which means no cache is used). 
/// \return Shared pointer to the CSVDataset +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string train_file = "/path/to/csv_file"; +/// std::vector column_names = {"col1", "col2", "col3", "col4"}; +/// std::shared_ptr ds = CSV({train_file}, ',', {}, column_names, 0, ShuffleMode::kFalse); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: As we defined before, the dataset has column "col1", "col2", "col3" and "col4" */ +/// auto col1 = row["col1"]; +/// \endcode inline std::shared_ptr CSV(const std::vector &dataset_files, char field_delim = ',', const std::vector> &column_defaults = {}, const std::vector &column_names = {}, int64_t num_samples = 0, @@ -1582,6 +1858,20 @@ class DIV2KDataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the current DIV2KDataset. 
+/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string dataset_path = "/path/to/div2k_dataset_directory"; +/// std::shared_ptr ds = DIV2K(dataset_path, "train", "bicubic", 2); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In DIV2K dataset, each dictionary has keys "hr_image" and "lr_image" */ +/// auto hr_image = row["hr_image"]; +/// \endcode inline std::shared_ptr DIV2K(const std::string &dataset_dir, const std::string &usage, const std::string &downgrade, int32_t scale, bool decode = false, const std::shared_ptr &sampler = std::make_shared(), @@ -1677,6 +1967,20 @@ class EMnistDataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). /// \return Shared pointer to the current EMnistDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/emnist_dataset_directory"; +/// std::shared_ptr ds = EMnist(folder_path, "mnist", "train", std::make_shared(false, 5)); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In EMNIST dataset, each dictionary has keys "image" and "label" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr EMnist( const std::string &dataset_dir, const std::string &name, const std::string &usage = "all", const std::shared_ptr &sampler = std::make_shared(), @@ -1760,6 +2064,21 @@ class FlickrDataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use.
(default=nullptr which means no cache is used). /// \return Shared pointer to the current FlickrDataset +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string dataset_path = "/path/to/flickr30k_dataset_directory"; +/// std::string file_path = "/path/to/token_file"; +/// std::shared_ptr ds = Flickr(dataset_path, file_path); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In FLICKR dataset, each dictionary has keys "image" and "annotation" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr Flickr( const std::string &dataset_dir, const std::string &annotation_file, bool decode = false, const std::shared_ptr &sampler = std::make_shared(), @@ -1855,6 +2174,20 @@ class ImageFolderDataset : public Dataset { /// \param[in] class_indexing a class name to label map. /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the ImageFolderDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string dataset_path = "/path/to/image_directory"; +/// std::shared_ptr ds = ImageFolder(dataset_path, true, std::make_shared(false, 10)); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In ImageFolder dataset, each data dictionary has keys "image" and "label" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr ImageFolder( const std::string &dataset_dir, bool decode = false, const std::shared_ptr &sampler = std::make_shared(), @@ -1961,6 +2294,20 @@ class ManifestDataset : public Dataset { /// \param[in] decode Decode the images after reading (default=false). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
/// \return Shared pointer to the ManifestDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string file_path = "/path/to/manifest_file"; +/// std::shared_ptr ds = Manifest(file_path); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In Manifest dataset, each data dictionary has keys "image" and "label" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr Manifest( const std::string &dataset_file, const std::string &usage = "train", const std::shared_ptr &sampler = std::make_shared(), @@ -2154,6 +2501,21 @@ class MindDataDataset : public Dataset { /// ShuffleMode::kInfile - Shuffle samples in file. /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the current MindDataDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string file_path = "/path/to/mindrecord_file"; +/// std::vector column_names = {"data", "file_name", "label"}; +/// std::shared_ptr ds = MindData(file_path, column_names); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: As we defined before, each data dictionary owns keys "data", "file_name" and "label" */ +/// auto data = row["data"]; +/// \endcode inline std::shared_ptr MindData( const std::string &dataset_file, const std::vector &columns_list = {}, const std::shared_ptr &sampler = std::make_shared(), nlohmann::json *padded_sample = nullptr, @@ -2229,6 +2591,23 @@ inline std::shared_ptr MindData(const std::string &dataset_file /// ShuffleMode::kInfile - Shuffle samples in file. /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the MindDataDataset.
+/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string file_path1 = "/path/to/mindrecord_file1"; +/// std::string file_path2 = "/path/to/mindrecord_file2"; +/// std::vector file_list = {file_path1, file_path2}; +/// std::vector column_names = {"data", "file_name", "label"}; +/// std::shared_ptr ds = MindData(file_list, column_names); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: As we defined before, each data dictionary owns keys "data", "file_name" and "label" */ +/// auto data = row["data"]; +/// \endcode inline std::shared_ptr MindData( const std::vector &dataset_files, const std::vector &columns_list = {}, const std::shared_ptr &sampler = std::make_shared(), nlohmann::json *padded_sample = nullptr, @@ -2328,6 +2707,20 @@ class MnistDataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the MnistDataset.
+/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/mnist_dataset_directory"; +/// std::shared_ptr ds = Mnist(folder_path, "all", std::make_shared(false, 20)); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In MNIST dataset, each dictionary has keys "image" and "label" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr Mnist(const std::string &dataset_dir, const std::string &usage = "all", const std::shared_ptr &sampler = std::make_shared(), const std::shared_ptr &cache = nullptr) { @@ -2409,6 +2802,20 @@ class QMnistDataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the QMnistDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/qmnist_dataset_directory"; +/// std::shared_ptr ds = QMnist(folder_path, "train", true, std::make_shared(false, 5)); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In QMNIST dataset, each dictionary has keys "image" and "label" */ +/// auto image = row["image"]; +/// \endcode inline std::shared_ptr QMnist( const std::string &dataset_dir, const std::string &usage = "all", bool compat = true, const std::shared_ptr &sampler = std::make_shared(), @@ -2484,6 +2891,22 @@ class RandomDataDataset : public Dataset { /// \param[in] columns_list List of columns to be read (default={}, read all columns). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). 
/// \return Shared pointer to the RandomDataset. +/// \par Example +/// \code +/// /* Define MindData objects */ +/// std::shared_ptr schema = Schema(); +/// schema->add_column("column1", mindspore::DataType::kNumberTypeUInt8, {2}); +/// schema->add_column("column2", mindspore::DataType::kNumberTypeUInt8, {1}); +/// std::shared_ptr ds = RandomData(50, schema); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: As we defined the schema before, each data dictionary owns keys "column1" and "column2" */ +/// auto column1 = row["column1"]; +/// \endcode template > std::shared_ptr RandomData(const int32_t &total_rows = 0, const T &schema = nullptr, const std::vector &columns_list = {}, @@ -2540,6 +2963,20 @@ class SBUDataset : public Dataset { /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the current SBUDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string folder_path = "/path/to/sbu_dataset_directory"; +/// std::shared_ptr ds = SBU(folder_path, true, std::make_shared(false, 5)); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In SBU dataset, each dictionary has keys "image" and "caption" */ +/// auto caption = row["caption"]; +/// \endcode inline std::shared_ptr SBU(const std::string &dataset_dir, bool decode = false, const std::shared_ptr &sampler = std::make_shared(), const std::shared_ptr &cache = nullptr) { @@ -2612,6 +3049,20 @@ class TextFileDataset : public Dataset { /// specified only when num_shards is also specified (Default = 0). 
/// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the TextFileDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string file_path = "/path/to/text_file_dataset_file"; +/// std::shared_ptr ds = TextFile({file_path}, 2); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: In TextFile dataset, each dictionary has key "text" */ +/// auto text = row["text"]; +/// \endcode inline std::shared_ptr TextFile(const std::vector &dataset_files, int64_t num_samples = 0, ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, int32_t shard_id = 0, @@ -2702,6 +3153,21 @@ class TFRecordDataset : public Dataset { /// each shard may be not equal). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). /// \return Shared pointer to the TFRecordDataset. +/// \par Example +/// \code +/// /* Define dataset path and MindData object */ +/// std::string file_path = "/path/to/tfrecord_file"; +/// std::string schema_path = "/path/to/schema_file"; +/// std::shared_ptr ds = TFRecord({file_path}, schema_path, {"image"}); +/// +/// /* Create iterator to read dataset */ +/// std::shared_ptr iter = ds->CreateIterator(); +/// std::unordered_map row; +/// iter->GetNextRow(&row); +/// +/// /* Note: The columns of generated dataset depend on the source TFRecord files. */ +/// auto image = row["image"]; +/// \endcode template > std::shared_ptr TFRecord(const std::vector &dataset_files, const T &schema = nullptr, const std::vector &columns_list = {}, int64_t num_samples = 0, @@ -2770,6 +3236,20 @@ class USPSDataset : public Dataset { /// specified only when num_shards is also specified (Default = 0). /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). 
/// \return Shared pointer to the current USPSDataset.
+/// \par Example
+/// \code
+/// /* Define dataset path and MindData object */
+/// std::string folder_path = "/path/to/usps_dataset_directory";
+/// std::shared_ptr<Dataset> ds = USPS(folder_path, "train");
+///
+/// /* Create iterator to read dataset */
+/// std::shared_ptr<Iterator> iter = ds->CreateIterator();
+/// std::unordered_map<std::string, mindspore::MSTensor> row;
+/// iter->GetNextRow(&row);
+///
+/// /* Note: In USPS dataset, each dictionary has keys "image" and "label" */
+/// auto image = row["image"];
+/// \endcode
 inline std::shared_ptr<USPSDataset> USPS(const std::string &dataset_dir, const std::string &usage = "all",
                                          int64_t num_samples = 0, ShuffleMode shuffle = ShuffleMode::kGlobal,
                                          int32_t num_shards = 1, int32_t shard_id = 0,
@@ -2842,6 +3322,22 @@ class VOCDataset : public Dataset {
 /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
 /// \param[in] extra_metadata Flag to add extra meta-data to row (default=false).
 /// \return Shared pointer to the VOCDataset
+/// \par Example
+/// \code
+/// /* Define dataset path and MindData object */
+/// std::string folder_path = "/path/to/voc_dataset_directory";
+/// std::shared_ptr<Dataset> ds = VOC(folder_path, "Detection", "train", {}, false,
+///                                   std::make_shared<SequentialSampler>(0, 6));
+///
+/// /* Create iterator to read dataset */
+/// std::shared_ptr<Iterator> iter = ds->CreateIterator();
+/// std::unordered_map<std::string, mindspore::MSTensor> row;
+/// iter->GetNextRow(&row);
+///
+/// /* Note: In VOC dataset, if task='Segmentation', each dictionary has keys "image" and "target" */
+/// /* Note: In VOC dataset, if task='Detection', each dictionary has keys "image" and "annotation" */
+/// auto image = row["image"];
+/// \endcode
 inline std::shared_ptr<VOCDataset> VOC(const std::string &dataset_dir, const std::string &task = "Segmentation",
                                        const std::string &usage = "train",
                                        const std::map<std::string, int32_t> &class_indexing = {}, bool decode = false,
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/execute.h
b/mindspore/ccsrc/minddata/dataset/include/dataset/execute.h index 73f91be5feb..74a59a3f8b5 100644 --- a/mindspore/ccsrc/minddata/dataset/include/dataset/execute.h +++ b/mindspore/ccsrc/minddata/dataset/include/dataset/execute.h @@ -98,12 +98,35 @@ class Execute { /// \param[in] input Tensor to be transformed. /// \param[out] output Transformed tensor. /// \return Status error code, returns OK if no error encountered. + /// \par Example + /// \code + /// /* Usage of Execute */ + /// std::shared_ptr decode = std::make_shared(); + /// std::shared_ptr center_crop(new vision::CenterCrop({30})); + /// std::shared_ptr rescale = std::make_shared(1. / 3, 0.5); + /// mindspore::dataset::Execute transform = Execute({decode, center_crop, rescale}); + /// + /// /* Apply transforms */ + /// mindspore::MSTensor image = ReadFileToTensor("apple.jpg"); + /// Status rc = transform(image, &image); + /// \endcode Status operator()(const mindspore::MSTensor &input, mindspore::MSTensor *output); /// \brief Callable function to execute the TensorTransform in eager mode. /// \param[in] input_tensor_list List of Tensor to be transformed. /// \param[out] out Result tensor after transform. /// \return Status error code, returns OK if no error encountered. 
+  /// \par Example
+  /// \code
+  /// /* Usage of Execute */
+  /// auto tokenizer = text::BasicTokenizer();
+  /// mindspore::dataset::Execute transform = Execute({tokenizer});
+  ///
+  /// /* Apply transforms */
+  /// std::vector<mindspore::MSTensor> txt = ReadTextToTensor("demo.txt");
+  /// std::vector<mindspore::MSTensor> txt_result;
+  /// Status rc = transform({txt}, &txt_result);
+  /// \endcode
   Status operator()(const std::vector<mindspore::MSTensor> &input_tensor_list, std::vector<mindspore::MSTensor> *out);
 
   /// \brief Given a set of Executes, run them
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/iterator.h b/mindspore/ccsrc/minddata/dataset/include/dataset/iterator.h
index 3af62c170f2..d9aa4ab95da 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/iterator.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/iterator.h
@@ -63,6 +63,13 @@ class Iterator {
   /// \note Type of return data is a unordered_map(with column name).
   /// \param[out] row The output tensor row.
   /// \return Status error code, returns OK if no error encountered.
+  /// \par Example
+  /// \code
+  /// /* dataset is an instance of Dataset object */
+  /// std::shared_ptr<Iterator> iter = dataset->CreateIterator();
+  /// std::unordered_map<std::string, mindspore::MSTensor> row;
+  /// iter->GetNextRow(&row);
+  /// \endcode
   Status GetNextRow(MSTensorMap *row) {
     if (row == nullptr) {
       return Status(kMDUnexpectedError, "Got nullptr when GetNext row.");
@@ -84,6 +91,13 @@
   /// \note Type of return data is a vector(without column name).
   /// \param[out] row The output tensor row.
   /// \return Status error code, returns OK if no error encountered.
+  /// \par Example
+  /// \code
+  /// /* dataset is an instance of Dataset object */
+  /// std::shared_ptr<Iterator> iter = dataset->CreateIterator();
+  /// std::vector<mindspore::MSTensor> row;
+  /// iter->GetNextRow(&row);
+  /// \endcode
   virtual Status GetNextRow(MSTensorVec *row);
 
   /// \brief Function to shut down the data pipeline.
@@ -144,6 +158,13 @@ class PullIterator : public Iterator {
   /// \note Type of return data is a vector(without column name).
   /// \param[out] row The output tensor row.
   /// \return Status error code, returns OK if no error encountered else false.
+  /// \par Example
+  /// \code
+  /// /* dataset is an instance of Dataset object */
+  /// std::shared_ptr<Iterator> iter = dataset->CreatePullBasedIterator();
+  /// std::vector<mindspore::MSTensor> row;
+  /// iter->GetNextRow(&row);
+  /// \endcode
   Status GetNextRow(MSTensorVec *const row) override;
 
   /// \brief Function to get specified rows from the data pipeline.
@@ -151,6 +172,13 @@
   /// \param[in] num_rows The number of rows to fetch.
   /// \param[out] row The output tensor row.
   /// \return Status error code, returns OK if no error encountered else false.
+  /// \par Example
+  /// \code
+  /// /* dataset is an instance of Dataset object */
+  /// std::shared_ptr<Iterator> iter = dataset->CreatePullBasedIterator();
+  /// std::vector<std::vector<mindspore::MSTensor>> rows;
+  /// iter->GetRows(5, &rows);
+  /// \endcode
   Status GetRows(int32_t num_rows, std::vector<MSTensorVec> *const row);
 
   /// \brief Method for building and launching the pipeline.
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h b/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h
index fd48b5a2ec7..8ebaefdecf3 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h
@@ -88,6 +88,12 @@ class DistributedSampler final : public Sampler {
   /// \param[in] offset The starting position where access to elements in the dataset begins (default=-1).
   /// \param[in] even_dist If true, each shard would return the same number of rows (default=true).
   ///     If false the total rows returned by all the shards would not have overlap.
+  /// \par Example
+  /// \code
+  /// /* creates a distributed sampler with 2 shards in total.
This shard is shard 0 */ + /// std::string file_path = "/path/to/test.mindrecord"; + /// std::shared_ptr ds = MindData(file_path, {}, std::make_shared(2, 0, false)); + /// \endcode DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle = true, int64_t num_samples = 0, uint32_t seed = 1, int64_t offset = -1, bool even_dist = true); /// \brief Destructor. @@ -119,6 +125,12 @@ class PKSampler final : public Sampler { /// \param[in] num_val Number of elements to sample for each class. /// \param[in] shuffle If true, the class IDs are shuffled (default=false). /// \param[in] num_samples The number of samples to draw (default=0, return all samples). + /// \par Example + /// \code + /// /* creates a PKSampler that will get 3 samples from every class. */ + /// std::string folder_path = "/path/to/image/folder"; + /// std::shared_ptr ds = ImageFolder(folder_path, true, std::make_shared(3)); + /// \endcode explicit PKSampler(int64_t num_val, bool shuffle = false, int64_t num_samples = 0); /// \brief Destructor. @@ -144,6 +156,12 @@ class RandomSampler final : public Sampler { /// \brief Constructor /// \param[in] replacement If true, put the sample ID back for the next draw (default=false). /// \param[in] num_samples The number of samples to draw (default=0, return all samples). + /// \par Example + /// \code + /// /* creates a RandomSampler that will get 10 samples randomly */ + /// std::string folder_path = "/path/to/image/folder"; + /// std::shared_ptr ds = ImageFolder(folder_path, true, std::make_shared(false, 10)); + /// \endcode explicit RandomSampler(bool replacement = false, int64_t num_samples = 0); /// \brief Destructor. @@ -168,6 +186,12 @@ class SequentialSampler final : public Sampler { /// \brief Constructor /// \param[in] start_index Index to start sampling at (default=0, start at first id). /// \param[in] num_samples The number of samples to draw (default=0, return all samples). 
+ /// \par Example + /// \code + /// /* creates a SequentialSampler that will get 2 samples sequentially in original dataset */ + /// std::string folder_path = "/path/to/image/folder"; + /// std::shared_ptr ds = ImageFolder(folder_path, false, std::make_shared(0, 2)); + /// \endcode explicit SequentialSampler(int64_t start_index = 0, int64_t num_samples = 0); /// \brief Destructor. @@ -192,6 +216,12 @@ class SubsetSampler : public Sampler { /// \brief Constructor /// \param[in] indices A vector sequence of indices. /// \param[in] num_samples The number of samples to draw (default=0, return all samples). + /// \par Example + /// \code + /// /* creates a SubsetSampler, will sample from the provided indices */ + /// std::string folder_path = "/path/to/image/folder"; + /// std::shared_ptr ds = ImageFolder(folder_path, false, std::make_shared({0, 2, 5})); + /// \endcode explicit SubsetSampler(std::vector indices, int64_t num_samples = 0); /// \brief Destructor. @@ -215,6 +245,12 @@ class SubsetRandomSampler final : public SubsetSampler { /// \brief Constructor /// \param[in] indices A vector sequence of indices. /// \param[in] num_samples The number of samples to draw (default=0, return all samples). + /// \par Example + /// \code + /// /* create a SubsetRandomSampler, will sample from the provided indices */ + /// std::string folder_path = "/path/to/image/folder"; + /// std::shared_ptr ds = ImageFolder(folder_path, false, std::make_shared({2, 7})); + /// \endcode explicit SubsetRandomSampler(std::vector indices, int64_t num_samples = 0); /// \brief Destructor. @@ -237,6 +273,14 @@ class WeightedRandomSampler final : public Sampler { /// \param[in] weights A vector sequence of weights, not necessarily summing up to 1. /// \param[in] num_samples The number of samples to draw (default=0, return all samples). /// \param[in] replacement If true, put the sample ID back for the next draw (default=true). 
+  /// \par Example
+  /// \code
+  /// /* creates a WeightedRandomSampler that will sample 4 elements with replacement */
+  /// std::vector<double> weights = {0.9, 0.8, 0.68, 0.7, 0.71, 0.6, 0.5, 0.4, 0.3, 0.5, 0.2, 0.1};
+  /// std::shared_ptr<Sampler> sampler = std::make_shared<WeightedRandomSampler>(weights, 4);
+  /// std::string folder_path = "/path/to/image/folder";
+  /// std::shared_ptr<Dataset> ds = ImageFolder(folder_path, false, sampler);
+  /// \endcode
   explicit WeightedRandomSampler(std::vector<double> weights, int64_t num_samples = 0, bool replacement = true);
 
   /// \brief Destructor.
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/vision.h b/mindspore/ccsrc/minddata/dataset/include/dataset/vision.h
index 26246f11960..eee167391f4 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/vision.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/vision.h
@@ -38,7 +38,7 @@ class TensorOperation;
 namespace vision {
 
 /// \brief AdjustGamma TensorTransform.
-/// \notes Apply gamma correction on input image.
+/// \note Apply gamma correction on input image.
 class AdjustGamma final : public TensorTransform {
  public:
   /// \brief Constructor.